Posted to commits@tvm.apache.org by tq...@apache.org on 2019/11/26 05:33:42 UTC

[incubator-tvm-test] branch asf-site updated: Build at Mon Nov 25 21:33:33 PST 2019

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-tvm-test.git
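
For anyone who wants to inspect this push locally, the sketch below is one possible way to do it, assuming only that git is available; the repository URL and the commit hash are the ones quoted in this notice.

```bash
# Sketch: clone the repository named above and check out the asf-site branch
git clone https://gitbox.apache.org/repos/asf/incubator-tvm-test.git
cd incubator-tvm-test
git checkout asf-site

# Show the summary and the full diff of the build commit announced below
git show --stat 2480acc820a9cd11f4d3cc6ce207108b153829f9
git show 2480acc820a9cd11f4d3cc6ce207108b153829f9
```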


The following commit(s) were added to refs/heads/asf-site by this push:
     new 2480acc  Build at Mon Nov 25 21:33:33 PST 2019
2480acc is described below

commit 2480acc820a9cd11f4d3cc6ce207108b153829f9
Author: tqchen <tq...@gmail.com>
AuthorDate: Mon Nov 25 21:33:33 2019 -0800

    Build at Mon Nov 25 21:33:33 PST 2019
---
 2017/08/17/tvm-release-announcement.html           |   12 +-
 ...s-with-TVM-A-Depthwise-Convolution-Example.html |   12 +-
 2017/10/06/nnvm-compiler-announcement.html         |   12 +-
 ...s-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html |   12 +-
 2017/11/08/android-rpc-introduction.html           |   12 +-
 2018/01/16/opt-mali-gpu.html                       |   12 +-
 2018/03/12/webgl.html                              |   12 +-
 2018/03/23/nmt-transformer-optimize.html           |   12 +-
 2018/07/12/vta-release-announcement.html           |   12 +-
 2018/08/10/DLPack-Bridge.html                      |   12 +-
 2018/10/03/auto-opt-all.html                       |   12 +-
 2018/10/09/ml-in-tees.html                         |   12 +-
 2018/12/18/lowprecision-conv.html                  |   12 +-
 2019/01/19/Golang.html                             |   12 +-
 2019/03/18/tvm-apache-announcement.html            |   12 +-
 2019/04/29/opt-cuda-quantized.html                 |   12 +-
 2019/05/30/pytorch-frontend.html                   |   12 +-
 404.html                                           |    1 +
 CNAME                                              |    1 -
 README.md                                          |   12 -
 about.html                                         |   12 +-
 .../bootstrap/css/bootstrap.2.2.2.min.css          |  782 +++++
 .../bootstrap/img/glyphicons-halflings-white.png   |  Bin 0 -> 8777 bytes
 .../bootstrap/img/glyphicons-halflings.png         |  Bin 0 -> 12799 bytes
 .../themes/custom-twitter/css/1.4.0/bootstrap.css  |  356 ++
 assets/themes/custom-twitter/css/style.css         |  417 +++
 atom.xml                                           | 3607 ++++++++++++++++++++
 blog.html                                          |   12 +-
 about.html => categories.html                      |   39 +-
 community.html                                     |   12 +-
 index.html                                         |  159 +
 rss.xml                                            | 3606 +++++++++++++++++++
 sitemap.txt                                        |   29 +
 about.html => tags.html                            |   40 +-
 vta.html                                           |   12 +-
 35 files changed, 9118 insertions(+), 183 deletions(-)
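
The summary above repeats the same 12-line header change across every post and top-level page: navigation links that pointed at https://tvm.ai now point at https://tvm.apache.org, alongside the removal of the tvm.ai CNAME and the addition of the generated assets, feeds, and a 404 page. Since this branch holds generated output, the rename most likely comes from the site generator's configuration rather than hand edits; purely as an illustration of the substitution being applied, a one-off host rename over static HTML could look like the sketch below (GNU sed assumed; this is not the project's actual deployment script).

```bash
# Illustrative only: rewrite tvm.ai links to tvm.apache.org in generated HTML
grep -rl 'https://tvm.ai' . --include='*.html' \
  | xargs sed -i 's|https://tvm\.ai|https://tvm.apache.org|g'
```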

diff --git a/2017/08/17/tvm-release-announcement.html b/2017/08/17/tvm-release-announcement.html
index 206004b..edf30a5 100644
--- a/2017/08/17/tvm-release-announcement.html
+++ b/2017/08/17/tvm-release-announcement.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html b/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html
index feb4ce3..2a5bf76 100644
--- a/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html
+++ b/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2017/10/06/nnvm-compiler-announcement.html b/2017/10/06/nnvm-compiler-announcement.html
index eeb55ee..ac03251 100644
--- a/2017/10/06/nnvm-compiler-announcement.html
+++ b/2017/10/06/nnvm-compiler-announcement.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html b/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
index d0a36c0..3e824c0 100644
--- a/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
+++ b/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2017/11/08/android-rpc-introduction.html b/2017/11/08/android-rpc-introduction.html
index 5eae692..1485184 100644
--- a/2017/11/08/android-rpc-introduction.html
+++ b/2017/11/08/android-rpc-introduction.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/01/16/opt-mali-gpu.html b/2018/01/16/opt-mali-gpu.html
index 06ec2a7..577da28 100644
--- a/2018/01/16/opt-mali-gpu.html
+++ b/2018/01/16/opt-mali-gpu.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/03/12/webgl.html b/2018/03/12/webgl.html
index a295bc0..56a855d 100644
--- a/2018/03/12/webgl.html
+++ b/2018/03/12/webgl.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/03/23/nmt-transformer-optimize.html b/2018/03/23/nmt-transformer-optimize.html
index e2a99c6..6d1c4f7 100644
--- a/2018/03/23/nmt-transformer-optimize.html
+++ b/2018/03/23/nmt-transformer-optimize.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/07/12/vta-release-announcement.html b/2018/07/12/vta-release-announcement.html
index 02974c5..dfb7dec 100644
--- a/2018/07/12/vta-release-announcement.html
+++ b/2018/07/12/vta-release-announcement.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/08/10/DLPack-Bridge.html b/2018/08/10/DLPack-Bridge.html
index 999e84c..babd031 100644
--- a/2018/08/10/DLPack-Bridge.html
+++ b/2018/08/10/DLPack-Bridge.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/10/03/auto-opt-all.html b/2018/10/03/auto-opt-all.html
index 233f02c..c86b295 100644
--- a/2018/10/03/auto-opt-all.html
+++ b/2018/10/03/auto-opt-all.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/10/09/ml-in-tees.html b/2018/10/09/ml-in-tees.html
index 7c5dd7f..ca4ef34 100644
--- a/2018/10/09/ml-in-tees.html
+++ b/2018/10/09/ml-in-tees.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2018/12/18/lowprecision-conv.html b/2018/12/18/lowprecision-conv.html
index 238d046..1689a3e 100644
--- a/2018/12/18/lowprecision-conv.html
+++ b/2018/12/18/lowprecision-conv.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2019/01/19/Golang.html b/2019/01/19/Golang.html
index bfc4a9c..dbce9e3 100644
--- a/2019/01/19/Golang.html
+++ b/2019/01/19/Golang.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2019/03/18/tvm-apache-announcement.html b/2019/03/18/tvm-apache-announcement.html
index fc8c8b0..f3f38ce 100644
--- a/2019/03/18/tvm-apache-announcement.html
+++ b/2019/03/18/tvm-apache-announcement.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2019/04/29/opt-cuda-quantized.html b/2019/04/29/opt-cuda-quantized.html
index aa12995..63e48b9 100644
--- a/2019/04/29/opt-cuda-quantized.html
+++ b/2019/04/29/opt-cuda-quantized.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/2019/05/30/pytorch-frontend.html b/2019/05/30/pytorch-frontend.html
index 767eeb3..ac2ef46 100644
--- a/2019/05/30/pytorch-frontend.html
+++ b/2019/05/30/pytorch-frontend.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/404.html b/404.html
new file mode 100644
index 0000000..6904bcd
--- /dev/null
+++ b/404.html
@@ -0,0 +1 @@
+Sorry this page does not exist =(
diff --git a/CNAME b/CNAME
deleted file mode 100644
index 4247f3d..0000000
--- a/CNAME
+++ /dev/null
@@ -1 +0,0 @@
-tvm.ai
diff --git a/README.md b/README.md
deleted file mode 100644
index 226efab..0000000
--- a/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# TVM Project Homepage
-
-## Serve Locally
-
-```bash
-./serve_local.sh
-```
-
-## Deployment
-
-We use the script [scripts/deploy_to_asf_site.sh](scripts/deploy_to_asf_site.sh)
-to generate and deploy content to the asf-site branch.
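
The README removed above pointed contributors at ./serve_local.sh for local preview and at scripts/deploy_to_asf_site.sh for publishing to this asf-site branch; neither script is part of this generated branch, and their contents are not shown here. If the site is built with Jekyll (the assets/themes layout suggests a jekyll-bootstrap style theme), a local preview would typically be something like the following, which is an assumption about what serve_local.sh wraps rather than its actual contents.

```bash
# Assumed typical Jekyll preview; not the actual contents of serve_local.sh
bundle exec jekyll serve --watch
# then open http://localhost:4000 in a browser
```
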
diff --git a/about.html b/about.html
index 749691f..50bcd6b 100644
--- a/about.html
+++ b/about.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li class="active"><a href="https://tvm.ai/about" class="active">About</a></li>
+      	<li class="active"><a href="https://tvm.apache.org/about" class="active">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/assets/themes/custom-twitter/bootstrap/css/bootstrap.2.2.2.min.css b/assets/themes/custom-twitter/bootstrap/css/bootstrap.2.2.2.min.css
new file mode 100644
index 0000000..9f19ba6
--- /dev/null
+++ b/assets/themes/custom-twitter/bootstrap/css/bootstrap.2.2.2.min.css
@@ -0,0 +1,782 @@
+/*!
+ * Bootstrap v2.2.2
+ *
+ * Copyright 2012 Twitter, Inc
+ * Licensed under the Apache License v2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Designed and built with all the love in the world @twitter by @mdo and @fat.
+ */
+.clearfix{*zoom:1;}.clearfix:before,.clearfix:after{display:table;content:"";line-height:0;}
+.clearfix:after{clear:both;}
+.hide-text{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0;}
+.input-block-level{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;}
+article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block;}
+audio,canvas,video{display:inline-block;*display:inline;*zoom:1;}
+audio:not([controls]){display:none;}
+html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;}
+a:focus{outline:thin dotted #333;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px;}
+a:hover,a:active{outline:0;}
+sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline;}
+sup{top:-0.5em;}
+sub{bottom:-0.25em;}
+img{max-width:100%;width:auto\9;height:auto;vertical-align:middle;border:0;-ms-interpolation-mode:bicubic;}
+#map_canvas img,.google-maps img{max-width:none;}
+button,input,select,textarea{margin:0;font-size:100%;vertical-align:middle;}
+button,input{*overflow:visible;line-height:normal;}
+button::-moz-focus-inner,input::-moz-focus-inner{padding:0;border:0;}
+button,html input[type="button"],input[type="reset"],input[type="submit"]{-webkit-appearance:button;cursor:pointer;}
+label,select,button,input[type="button"],input[type="reset"],input[type="submit"],input[type="radio"],input[type="checkbox"]{cursor:pointer;}
+input[type="search"]{-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box;-webkit-appearance:textfield;}
+input[type="search"]::-webkit-search-decoration,input[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none;}
+textarea{overflow:auto;vertical-align:top;}
+@media print{*{text-shadow:none !important;color:#000 !important;background:transparent !important;box-shadow:none !important;} a,a:visited{text-decoration:underline;} a[href]:after{content:" (" attr(href) ")";} abbr[title]:after{content:" (" attr(title) ")";} .ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:"";} pre,blockquote{border:1px solid #999;page-break-inside:avoid;} thead{display:table-header-group;} tr,img{page-break-inside:avoid;} img{max-width:100% !importa [...]
+a{color:#0088cc;text-decoration:none;}
+a:hover{color:#005580;text-decoration:underline;}
+.img-rounded{-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}
+.img-polaroid{padding:4px;background-color:#fff;border:1px solid #ccc;border:1px solid rgba(0, 0, 0, 0.2);-webkit-box-shadow:0 1px 3px rgba(0, 0, 0, 0.1);-moz-box-shadow:0 1px 3px rgba(0, 0, 0, 0.1);box-shadow:0 1px 3px rgba(0, 0, 0, 0.1);}
+.img-circle{-webkit-border-radius:500px;-moz-border-radius:500px;border-radius:500px;}
+.row{margin-left:-20px;*zoom:1;}.row:before,.row:after{display:table;content:"";line-height:0;}
+.row:after{clear:both;}
+[class*="span"]{float:left;min-height:1px;margin-left:20px;}
+.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom .container{width:940px;}
+.span12{width:940px;}
+.span11{width:860px;}
+.span10{width:780px;}
+.span9{width:700px;}
+.span8{width:620px;}
+.span7{width:540px;}
+.span6{width:460px;}
+.span5{width:380px;}
+.span4{width:300px;}
+.span3{width:220px;}
+.span2{width:140px;}
+.span1{width:60px;}
+.offset12{margin-left:980px;}
+.offset11{margin-left:900px;}
+.offset10{margin-left:820px;}
+.offset9{margin-left:740px;}
+.offset8{margin-left:660px;}
+.offset7{margin-left:580px;}
+.offset6{margin-left:500px;}
+.offset5{margin-left:420px;}
+.offset4{margin-left:340px;}
+.offset3{margin-left:260px;}
+.offset2{margin-left:180px;}
+.offset1{margin-left:100px;}
+.row-fluid{width:100%;*zoom:1;}.row-fluid:before,.row-fluid:after{display:table;content:"";line-height:0;}
+.row-fluid:after{clear:both;}
+.row-fluid [class*="span"]{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;float:left;margin-left:2.127659574468085%;*margin-left:2.074468085106383%;}
+.row-fluid [class*="span"]:first-child{margin-left:0;}
+.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.127659574468085%;}
+.row-fluid .span12{width:100%;*width:99.94680851063829%;}
+.row-fluid .span11{width:91.48936170212765%;*width:91.43617021276594%;}
+.row-fluid .span10{width:82.97872340425532%;*width:82.92553191489361%;}
+.row-fluid .span9{width:74.46808510638297%;*width:74.41489361702126%;}
+.row-fluid .span8{width:65.95744680851064%;*width:65.90425531914893%;}
+.row-fluid .span7{width:57.44680851063829%;*width:57.39361702127659%;}
+.row-fluid .span6{width:48.93617021276595%;*width:48.88297872340425%;}
+.row-fluid .span5{width:40.42553191489362%;*width:40.37234042553192%;}
+.row-fluid .span4{width:31.914893617021278%;*width:31.861702127659576%;}
+.row-fluid .span3{width:23.404255319148934%;*width:23.351063829787233%;}
+.row-fluid .span2{width:14.893617021276595%;*width:14.840425531914894%;}
+.row-fluid .span1{width:6.382978723404255%;*width:6.329787234042553%;}
+.row-fluid .offset12{margin-left:104.25531914893617%;*margin-left:104.14893617021275%;}
+.row-fluid .offset12:first-child{margin-left:102.12765957446808%;*margin-left:102.02127659574467%;}
+.row-fluid .offset11{margin-left:95.74468085106382%;*margin-left:95.6382978723404%;}
+.row-fluid .offset11:first-child{margin-left:93.61702127659574%;*margin-left:93.51063829787232%;}
+.row-fluid .offset10{margin-left:87.23404255319149%;*margin-left:87.12765957446807%;}
+.row-fluid .offset10:first-child{margin-left:85.1063829787234%;*margin-left:84.99999999999999%;}
+.row-fluid .offset9{margin-left:78.72340425531914%;*margin-left:78.61702127659572%;}
+.row-fluid .offset9:first-child{margin-left:76.59574468085106%;*margin-left:76.48936170212764%;}
+.row-fluid .offset8{margin-left:70.2127659574468%;*margin-left:70.10638297872339%;}
+.row-fluid .offset8:first-child{margin-left:68.08510638297872%;*margin-left:67.9787234042553%;}
+.row-fluid .offset7{margin-left:61.70212765957446%;*margin-left:61.59574468085106%;}
+.row-fluid .offset7:first-child{margin-left:59.574468085106375%;*margin-left:59.46808510638297%;}
+.row-fluid .offset6{margin-left:53.191489361702125%;*margin-left:53.085106382978715%;}
+.row-fluid .offset6:first-child{margin-left:51.063829787234035%;*margin-left:50.95744680851063%;}
+.row-fluid .offset5{margin-left:44.68085106382979%;*margin-left:44.57446808510638%;}
+.row-fluid .offset5:first-child{margin-left:42.5531914893617%;*margin-left:42.4468085106383%;}
+.row-fluid .offset4{margin-left:36.170212765957444%;*margin-left:36.06382978723405%;}
+.row-fluid .offset4:first-child{margin-left:34.04255319148936%;*margin-left:33.93617021276596%;}
+.row-fluid .offset3{margin-left:27.659574468085104%;*margin-left:27.5531914893617%;}
+.row-fluid .offset3:first-child{margin-left:25.53191489361702%;*margin-left:25.425531914893618%;}
+.row-fluid .offset2{margin-left:19.148936170212764%;*margin-left:19.04255319148936%;}
+.row-fluid .offset2:first-child{margin-left:17.02127659574468%;*margin-left:16.914893617021278%;}
+.row-fluid .offset1{margin-left:10.638297872340425%;*margin-left:10.53191489361702%;}
+.row-fluid .offset1:first-child{margin-left:8.51063829787234%;*margin-left:8.404255319148938%;}
+[class*="span"].hide,.row-fluid [class*="span"].hide{display:none;}
+[class*="span"].pull-right,.row-fluid [class*="span"].pull-right{float:right;}
+.container{margin-right:auto;margin-left:auto;*zoom:1;}.container:before,.container:after{display:table;content:"";line-height:0;}
+.container:after{clear:both;}
+.container-fluid{padding-right:20px;padding-left:20px;*zoom:1;}.container-fluid:before,.container-fluid:after{display:table;content:"";line-height:0;}
+.container-fluid:after{clear:both;}
+p{margin:0 0 10px;}
+.lead{margin-bottom:20px;font-size:21px;font-weight:200;line-height:30px;}
+small{font-size:85%;}
+strong{font-weight:bold;}
+em{font-style:italic;}
+cite{font-style:normal;}
+.muted{color:#999999;}
+a.muted:hover{color:#808080;}
+.text-warning{color:#c09853;}
+a.text-warning:hover{color:#a47e3c;}
+.text-error{color:#b94a48;}
+a.text-error:hover{color:#953b39;}
+.text-info{color:#3a87ad;}
+a.text-info:hover{color:#2d6987;}
+.text-success{color:#468847;}
+a.text-success:hover{color:#356635;}
+h1,h2,h3,h4,h5,h6{margin:10px 0;font-family:inherit;font-weight:bold;line-height:20px;color:inherit;text-rendering:optimizelegibility;}h1 small,h2 small,h3 small,h4 small,h5 small,h6 small{font-weight:normal;line-height:1;color:#999999;}
+h1,h2,h3{line-height:40px;}
+h1{font-size:38.5px;}
+h2{font-size:31.5px;}
+h3{font-size:24.5px;}
+h4{font-size:17.5px;}
+h5{font-size:14px;}
+h6{font-size:11.9px;}
+h1 small{font-size:24.5px;}
+h2 small{font-size:17.5px;}
+h3 small{font-size:14px;}
+h4 small{font-size:14px;}
+.page-header{padding-bottom:9px;margin:20px 0 30px;border-bottom:1px solid #eeeeee;}
+ul,ol{padding:0;margin:0 0 10px 25px;}
+ul ul,ul ol,ol ol,ol ul{margin-bottom:0;}
+li{line-height:20px;}
+ul.unstyled,ol.unstyled{margin-left:0;list-style:none;}
+ul.inline,ol.inline{margin-left:0;list-style:none;}ul.inline >li,ol.inline >li{display:inline-block;padding-left:5px;padding-right:5px;}
+dl{margin-bottom:20px;}
+dt,dd{line-height:20px;}
+dt{font-weight:bold;}
+dd{margin-left:10px;}
+.dl-horizontal{*zoom:1;}.dl-horizontal:before,.dl-horizontal:after{display:table;content:"";line-height:0;}
+.dl-horizontal:after{clear:both;}
+.dl-horizontal dt{float:left;width:160px;clear:left;text-align:right;overflow:hidden;text-overflow:ellipsis;white-space:nowrap;}
+.dl-horizontal dd{margin-left:180px;}
+hr{margin:20px 0;border:0;border-top:1px solid #eeeeee;border-bottom:1px solid #ffffff;}
+abbr[title],abbr[data-original-title]{cursor:help;border-bottom:1px dotted #999999;}
+abbr.initialism{font-size:90%;text-transform:uppercase;}
+blockquote{padding:0 0 0 15px;margin:0 0 20px;border-left:5px solid #eeeeee;}blockquote p{margin-bottom:0;font-size:16px;font-weight:300;line-height:25px;}
+blockquote small{display:block;line-height:20px;color:#999999;}blockquote small:before{content:'\2014 \00A0';}
+blockquote.pull-right{float:right;padding-right:15px;padding-left:0;border-right:5px solid #eeeeee;border-left:0;}blockquote.pull-right p,blockquote.pull-right small{text-align:right;}
+blockquote.pull-right small:before{content:'';}
+blockquote.pull-right small:after{content:'\00A0 \2014';}
+q:before,q:after,blockquote:before,blockquote:after{content:"";}
+address{display:block;margin-bottom:20px;font-style:normal;line-height:20px;}
+code,pre{padding:0 3px 2px;font-family:Monaco,Menlo,Consolas,"Courier New",monospace;font-size:12px;color:#333333;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+code{padding:2px 4px;color:#d14;background-color:#f7f7f9;border:1px solid #e1e1e8;white-space:nowrap;}
+pre{display:block;padding:9.5px;margin:0 0 10px;font-size:13px;line-height:20px;word-break:break-all;word-wrap:break-word;white-space:pre;white-space:pre-wrap;background-color:#f5f5f5;border:1px solid #ccc;border:1px solid rgba(0, 0, 0, 0.15);-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;}pre.prettyprint{margin-bottom:20px;}
+pre code{padding:0;color:inherit;white-space:pre;white-space:pre-wrap;background-color:transparent;border:0;}
+.pre-scrollable{max-height:340px;overflow-y:scroll;}
+.label,.badge{display:inline-block;padding:2px 4px;font-size:11.844px;font-weight:bold;line-height:14px;color:#ffffff;vertical-align:baseline;white-space:nowrap;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#999999;}
+.label{-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+.badge{padding-left:9px;padding-right:9px;-webkit-border-radius:9px;-moz-border-radius:9px;border-radius:9px;}
+.label:empty,.badge:empty{display:none;}
+a.label:hover,a.badge:hover{color:#ffffff;text-decoration:none;cursor:pointer;}
+.label-important,.badge-important{background-color:#b94a48;}
+.label-important[href],.badge-important[href]{background-color:#953b39;}
+.label-warning,.badge-warning{background-color:#f89406;}
+.label-warning[href],.badge-warning[href]{background-color:#c67605;}
+.label-success,.badge-success{background-color:#468847;}
+.label-success[href],.badge-success[href]{background-color:#356635;}
+.label-info,.badge-info{background-color:#3a87ad;}
+.label-info[href],.badge-info[href]{background-color:#2d6987;}
+.label-inverse,.badge-inverse{background-color:#333333;}
+.label-inverse[href],.badge-inverse[href]{background-color:#1a1a1a;}
+.btn .label,.btn .badge{position:relative;top:-1px;}
+.btn-mini .label,.btn-mini .badge{top:0;}
+table{max-width:100%;background-color:transparent;border-collapse:collapse;border-spacing:0;}
+.table{width:100%;margin-bottom:20px;}.table th,.table td{padding:8px;line-height:20px;text-align:left;vertical-align:top;border-top:1px solid #dddddd;}
+.table th{font-weight:bold;}
+.table thead th{vertical-align:bottom;}
+.table caption+thead tr:first-child th,.table caption+thead tr:first-child td,.table colgroup+thead tr:first-child th,.table colgroup+thead tr:first-child td,.table thead:first-child tr:first-child th,.table thead:first-child tr:first-child td{border-top:0;}
+.table tbody+tbody{border-top:2px solid #dddddd;}
+.table .table{background-color:#ffffff;}
+.table-condensed th,.table-condensed td{padding:4px 5px;}
+.table-bordered{border:1px solid #dddddd;border-collapse:separate;*border-collapse:collapse;border-left:0;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;}.table-bordered th,.table-bordered td{border-left:1px solid #dddddd;}
+.table-bordered caption+thead tr:first-child th,.table-bordered caption+tbody tr:first-child th,.table-bordered caption+tbody tr:first-child td,.table-bordered colgroup+thead tr:first-child th,.table-bordered colgroup+tbody tr:first-child th,.table-bordered colgroup+tbody tr:first-child td,.table-bordered thead:first-child tr:first-child th,.table-bordered tbody:first-child tr:first-child th,.table-bordered tbody:first-child tr:first-child td{border-top:0;}
+.table-bordered thead:first-child tr:first-child>th:first-child,.table-bordered tbody:first-child tr:first-child>td:first-child{-webkit-border-top-left-radius:4px;-moz-border-radius-topleft:4px;border-top-left-radius:4px;}
+.table-bordered thead:first-child tr:first-child>th:last-child,.table-bordered tbody:first-child tr:first-child>td:last-child{-webkit-border-top-right-radius:4px;-moz-border-radius-topright:4px;border-top-right-radius:4px;}
+.table-bordered thead:last-child tr:last-child>th:first-child,.table-bordered tbody:last-child tr:last-child>td:first-child,.table-bordered tfoot:last-child tr:last-child>td:first-child{-webkit-border-bottom-left-radius:4px;-moz-border-radius-bottomleft:4px;border-bottom-left-radius:4px;}
+.table-bordered thead:last-child tr:last-child>th:last-child,.table-bordered tbody:last-child tr:last-child>td:last-child,.table-bordered tfoot:last-child tr:last-child>td:last-child{-webkit-border-bottom-right-radius:4px;-moz-border-radius-bottomright:4px;border-bottom-right-radius:4px;}
+.table-bordered tfoot+tbody:last-child tr:last-child td:first-child{-webkit-border-bottom-left-radius:0;-moz-border-radius-bottomleft:0;border-bottom-left-radius:0;}
+.table-bordered tfoot+tbody:last-child tr:last-child td:last-child{-webkit-border-bottom-right-radius:0;-moz-border-radius-bottomright:0;border-bottom-right-radius:0;}
+.table-bordered caption+thead tr:first-child th:first-child,.table-bordered caption+tbody tr:first-child td:first-child,.table-bordered colgroup+thead tr:first-child th:first-child,.table-bordered colgroup+tbody tr:first-child td:first-child{-webkit-border-top-left-radius:4px;-moz-border-radius-topleft:4px;border-top-left-radius:4px;}
+.table-bordered caption+thead tr:first-child th:last-child,.table-bordered caption+tbody tr:first-child td:last-child,.table-bordered colgroup+thead tr:first-child th:last-child,.table-bordered colgroup+tbody tr:first-child td:last-child{-webkit-border-top-right-radius:4px;-moz-border-radius-topright:4px;border-top-right-radius:4px;}
+.table-striped tbody>tr:nth-child(odd)>td,.table-striped tbody>tr:nth-child(odd)>th{background-color:#f9f9f9;}
+.table-hover tbody tr:hover td,.table-hover tbody tr:hover th{background-color:#f5f5f5;}
+table td[class*="span"],table th[class*="span"],.row-fluid table td[class*="span"],.row-fluid table th[class*="span"]{display:table-cell;float:none;margin-left:0;}
+.table td.span1,.table th.span1{float:none;width:44px;margin-left:0;}
+.table td.span2,.table th.span2{float:none;width:124px;margin-left:0;}
+.table td.span3,.table th.span3{float:none;width:204px;margin-left:0;}
+.table td.span4,.table th.span4{float:none;width:284px;margin-left:0;}
+.table td.span5,.table th.span5{float:none;width:364px;margin-left:0;}
+.table td.span6,.table th.span6{float:none;width:444px;margin-left:0;}
+.table td.span7,.table th.span7{float:none;width:524px;margin-left:0;}
+.table td.span8,.table th.span8{float:none;width:604px;margin-left:0;}
+.table td.span9,.table th.span9{float:none;width:684px;margin-left:0;}
+.table td.span10,.table th.span10{float:none;width:764px;margin-left:0;}
+.table td.span11,.table th.span11{float:none;width:844px;margin-left:0;}
+.table td.span12,.table th.span12{float:none;width:924px;margin-left:0;}
+.table tbody tr.success td{background-color:#dff0d8;}
+.table tbody tr.error td{background-color:#f2dede;}
+.table tbody tr.warning td{background-color:#fcf8e3;}
+.table tbody tr.info td{background-color:#d9edf7;}
+.table-hover tbody tr.success:hover td{background-color:#d0e9c6;}
+.table-hover tbody tr.error:hover td{background-color:#ebcccc;}
+.table-hover tbody tr.warning:hover td{background-color:#faf2cc;}
+.table-hover tbody tr.info:hover td{background-color:#c4e3f3;}
+form{margin:0 0 20px;}
+fieldset{padding:0;margin:0;border:0;}
+legend{display:block;width:100%;padding:0;margin-bottom:20px;font-size:21px;line-height:40px;color:#333333;border:0;border-bottom:1px solid #e5e5e5;}legend small{font-size:15px;color:#999999;}
+label,input,button,select,textarea{font-size:14px;font-weight:normal;line-height:20px;}
+input,button,select,textarea{font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;}
+label{display:block;margin-bottom:5px;}
+select,textarea,input[type="text"],input[type="password"],input[type="datetime"],input[type="datetime-local"],input[type="date"],input[type="month"],input[type="time"],input[type="week"],input[type="number"],input[type="email"],input[type="url"],input[type="search"],input[type="tel"],input[type="color"],.uneditable-input{display:inline-block;height:20px;padding:4px 6px;margin-bottom:10px;font-size:14px;line-height:20px;color:#555555;-webkit-border-radius:4px;-moz-border-radius:4px;border [...]
+input,textarea,.uneditable-input{width:206px;}
+textarea{height:auto;}
+textarea,input[type="text"],input[type="password"],input[type="datetime"],input[type="datetime-local"],input[type="date"],input[type="month"],input[type="time"],input[type="week"],input[type="number"],input[type="email"],input[type="url"],input[type="search"],input[type="tel"],input[type="color"],.uneditable-input{background-color:#ffffff;border:1px solid #cccccc;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);box-shadow:inset  [...]
+input[type="radio"],input[type="checkbox"]{margin:4px 0 0;*margin-top:0;margin-top:1px \9;line-height:normal;}
+input[type="file"],input[type="image"],input[type="submit"],input[type="reset"],input[type="button"],input[type="radio"],input[type="checkbox"]{width:auto;}
+select,input[type="file"]{height:30px;*margin-top:4px;line-height:30px;}
+select{width:220px;border:1px solid #cccccc;background-color:#ffffff;}
+select[multiple],select[size]{height:auto;}
+select:focus,input[type="file"]:focus,input[type="radio"]:focus,input[type="checkbox"]:focus{outline:thin dotted #333;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px;}
+.uneditable-input,.uneditable-textarea{color:#999999;background-color:#fcfcfc;border-color:#cccccc;-webkit-box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.025);-moz-box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.025);box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.025);cursor:not-allowed;}
+.uneditable-input{overflow:hidden;white-space:nowrap;}
+.uneditable-textarea{width:auto;height:auto;}
+input:-moz-placeholder,textarea:-moz-placeholder{color:#999999;}
+input:-ms-input-placeholder,textarea:-ms-input-placeholder{color:#999999;}
+input::-webkit-input-placeholder,textarea::-webkit-input-placeholder{color:#999999;}
+.radio,.checkbox{min-height:20px;padding-left:20px;}
+.radio input[type="radio"],.checkbox input[type="checkbox"]{float:left;margin-left:-20px;}
+.controls>.radio:first-child,.controls>.checkbox:first-child{padding-top:5px;}
+.radio.inline,.checkbox.inline{display:inline-block;padding-top:5px;margin-bottom:0;vertical-align:middle;}
+.radio.inline+.radio.inline,.checkbox.inline+.checkbox.inline{margin-left:10px;}
+.input-mini{width:60px;}
+.input-small{width:90px;}
+.input-medium{width:150px;}
+.input-large{width:210px;}
+.input-xlarge{width:270px;}
+.input-xxlarge{width:530px;}
+input[class*="span"],select[class*="span"],textarea[class*="span"],.uneditable-input[class*="span"],.row-fluid input[class*="span"],.row-fluid select[class*="span"],.row-fluid textarea[class*="span"],.row-fluid .uneditable-input[class*="span"]{float:none;margin-left:0;}
+.input-append input[class*="span"],.input-append .uneditable-input[class*="span"],.input-prepend input[class*="span"],.input-prepend .uneditable-input[class*="span"],.row-fluid input[class*="span"],.row-fluid select[class*="span"],.row-fluid textarea[class*="span"],.row-fluid .uneditable-input[class*="span"],.row-fluid .input-prepend [class*="span"],.row-fluid .input-append [class*="span"]{display:inline-block;}
+input,textarea,.uneditable-input{margin-left:0;}
+.controls-row [class*="span"]+[class*="span"]{margin-left:20px;}
+input.span12, textarea.span12, .uneditable-input.span12{width:926px;}
+input.span11, textarea.span11, .uneditable-input.span11{width:846px;}
+input.span10, textarea.span10, .uneditable-input.span10{width:766px;}
+input.span9, textarea.span9, .uneditable-input.span9{width:686px;}
+input.span8, textarea.span8, .uneditable-input.span8{width:606px;}
+input.span7, textarea.span7, .uneditable-input.span7{width:526px;}
+input.span6, textarea.span6, .uneditable-input.span6{width:446px;}
+input.span5, textarea.span5, .uneditable-input.span5{width:366px;}
+input.span4, textarea.span4, .uneditable-input.span4{width:286px;}
+input.span3, textarea.span3, .uneditable-input.span3{width:206px;}
+input.span2, textarea.span2, .uneditable-input.span2{width:126px;}
+input.span1, textarea.span1, .uneditable-input.span1{width:46px;}
+.controls-row{*zoom:1;}.controls-row:before,.controls-row:after{display:table;content:"";line-height:0;}
+.controls-row:after{clear:both;}
+.controls-row [class*="span"],.row-fluid .controls-row [class*="span"]{float:left;}
+.controls-row .checkbox[class*="span"],.controls-row .radio[class*="span"]{padding-top:5px;}
+input[disabled],select[disabled],textarea[disabled],input[readonly],select[readonly],textarea[readonly]{cursor:not-allowed;background-color:#eeeeee;}
+input[type="radio"][disabled],input[type="checkbox"][disabled],input[type="radio"][readonly],input[type="checkbox"][readonly]{background-color:transparent;}
+.control-group.warning .control-label,.control-group.warning .help-block,.control-group.warning .help-inline{color:#c09853;}
+.control-group.warning .checkbox,.control-group.warning .radio,.control-group.warning input,.control-group.warning select,.control-group.warning textarea{color:#c09853;}
+.control-group.warning input,.control-group.warning select,.control-group.warning textarea{border-color:#c09853;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);}.control-group.warning input:focus,.control-group.warning select:focus,.control-group.warning textarea:focus{border-color:#a47e3c;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075),0 0 6px #dbc59e;-moz-box-shadow:inse [...]
+.control-group.warning .input-prepend .add-on,.control-group.warning .input-append .add-on{color:#c09853;background-color:#fcf8e3;border-color:#c09853;}
+.control-group.error .control-label,.control-group.error .help-block,.control-group.error .help-inline{color:#b94a48;}
+.control-group.error .checkbox,.control-group.error .radio,.control-group.error input,.control-group.error select,.control-group.error textarea{color:#b94a48;}
+.control-group.error input,.control-group.error select,.control-group.error textarea{border-color:#b94a48;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);}.control-group.error input:focus,.control-group.error select:focus,.control-group.error textarea:focus{border-color:#953b39;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075),0 0 6px #d59392;-moz-box-shadow:inset 0 1px 1px  [...]
+.control-group.error .input-prepend .add-on,.control-group.error .input-append .add-on{color:#b94a48;background-color:#f2dede;border-color:#b94a48;}
+.control-group.success .control-label,.control-group.success .help-block,.control-group.success .help-inline{color:#468847;}
+.control-group.success .checkbox,.control-group.success .radio,.control-group.success input,.control-group.success select,.control-group.success textarea{color:#468847;}
+.control-group.success input,.control-group.success select,.control-group.success textarea{border-color:#468847;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);}.control-group.success input:focus,.control-group.success select:focus,.control-group.success textarea:focus{border-color:#356635;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075),0 0 6px #7aba7b;-moz-box-shadow:inse [...]
+.control-group.success .input-prepend .add-on,.control-group.success .input-append .add-on{color:#468847;background-color:#dff0d8;border-color:#468847;}
+.control-group.info .control-label,.control-group.info .help-block,.control-group.info .help-inline{color:#3a87ad;}
+.control-group.info .checkbox,.control-group.info .radio,.control-group.info input,.control-group.info select,.control-group.info textarea{color:#3a87ad;}
+.control-group.info input,.control-group.info select,.control-group.info textarea{border-color:#3a87ad;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075);}.control-group.info input:focus,.control-group.info select:focus,.control-group.info textarea:focus{border-color:#2d6987;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.075),0 0 6px #7ab5d3;-moz-box-shadow:inset 0 1px 1px rgba(0 [...]
+.control-group.info .input-prepend .add-on,.control-group.info .input-append .add-on{color:#3a87ad;background-color:#d9edf7;border-color:#3a87ad;}
+input:focus:invalid,textarea:focus:invalid,select:focus:invalid{color:#b94a48;border-color:#ee5f5b;}input:focus:invalid:focus,textarea:focus:invalid:focus,select:focus:invalid:focus{border-color:#e9322d;-webkit-box-shadow:0 0 6px #f8b9b7;-moz-box-shadow:0 0 6px #f8b9b7;box-shadow:0 0 6px #f8b9b7;}
+.form-actions{padding:19px 20px 20px;margin-top:20px;margin-bottom:20px;background-color:#f5f5f5;border-top:1px solid #e5e5e5;*zoom:1;}.form-actions:before,.form-actions:after{display:table;content:"";line-height:0;}
+.form-actions:after{clear:both;}
+.help-block,.help-inline{color:#595959;}
+.help-block{display:block;margin-bottom:10px;}
+.help-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle;padding-left:5px;}
+.input-append,.input-prepend{margin-bottom:5px;font-size:0;white-space:nowrap;}.input-append input,.input-prepend input,.input-append select,.input-prepend select,.input-append .uneditable-input,.input-prepend .uneditable-input,.input-append .dropdown-menu,.input-prepend .dropdown-menu{font-size:14px;}
+.input-append input,.input-prepend input,.input-append select,.input-prepend select,.input-append .uneditable-input,.input-prepend .uneditable-input{position:relative;margin-bottom:0;*margin-left:0;vertical-align:top;-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0;}.input-append input:focus,.input-prepend input:focus,.input-append select:focus,.input-prepend select:focus,.input-append .uneditable-input:focus,.input-prepend .uneditable-input:focu [...]
+.input-append .add-on,.input-prepend .add-on{display:inline-block;width:auto;height:20px;min-width:16px;padding:4px 5px;font-size:14px;font-weight:normal;line-height:20px;text-align:center;text-shadow:0 1px 0 #ffffff;background-color:#eeeeee;border:1px solid #ccc;}
+.input-append .add-on,.input-prepend .add-on,.input-append .btn,.input-prepend .btn,.input-append .btn-group>.dropdown-toggle,.input-prepend .btn-group>.dropdown-toggle{vertical-align:top;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.input-append .active,.input-prepend .active{background-color:#a9dba9;border-color:#46a546;}
+.input-prepend .add-on,.input-prepend .btn{margin-right:-1px;}
+.input-prepend .add-on:first-child,.input-prepend .btn:first-child{-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px;}
+.input-append input,.input-append select,.input-append .uneditable-input{-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px;}.input-append input+.btn-group .btn:last-child,.input-append select+.btn-group .btn:last-child,.input-append .uneditable-input+.btn-group .btn:last-child{-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0;}
+.input-append .add-on,.input-append .btn,.input-append .btn-group{margin-left:-1px;}
+.input-append .add-on:last-child,.input-append .btn:last-child,.input-append .btn-group:last-child>.dropdown-toggle{-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0;}
+.input-prepend.input-append input,.input-prepend.input-append select,.input-prepend.input-append .uneditable-input{-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}.input-prepend.input-append input+.btn-group .btn,.input-prepend.input-append select+.btn-group .btn,.input-prepend.input-append .uneditable-input+.btn-group .btn{-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0;}
+.input-prepend.input-append .add-on:first-child,.input-prepend.input-append .btn:first-child{margin-right:-1px;-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px;}
+.input-prepend.input-append .add-on:last-child,.input-prepend.input-append .btn:last-child{margin-left:-1px;-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0;}
+.input-prepend.input-append .btn-group:first-child{margin-left:0;}
+input.search-query{padding-right:14px;padding-right:4px \9;padding-left:14px;padding-left:4px \9;margin-bottom:0;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px;}
+.form-search .input-append .search-query,.form-search .input-prepend .search-query{-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.form-search .input-append .search-query{-webkit-border-radius:14px 0 0 14px;-moz-border-radius:14px 0 0 14px;border-radius:14px 0 0 14px;}
+.form-search .input-append .btn{-webkit-border-radius:0 14px 14px 0;-moz-border-radius:0 14px 14px 0;border-radius:0 14px 14px 0;}
+.form-search .input-prepend .search-query{-webkit-border-radius:0 14px 14px 0;-moz-border-radius:0 14px 14px 0;border-radius:0 14px 14px 0;}
+.form-search .input-prepend .btn{-webkit-border-radius:14px 0 0 14px;-moz-border-radius:14px 0 0 14px;border-radius:14px 0 0 14px;}
+.form-search input,.form-inline input,.form-horizontal input,.form-search textarea,.form-inline textarea,.form-horizontal textarea,.form-search select,.form-inline select,.form-horizontal select,.form-search .help-inline,.form-inline .help-inline,.form-horizontal .help-inline,.form-search .uneditable-input,.form-inline .uneditable-input,.form-horizontal .uneditable-input,.form-search .input-prepend,.form-inline .input-prepend,.form-horizontal .input-prepend,.form-search .input-append,.fo [...]
+.form-search .hide,.form-inline .hide,.form-horizontal .hide{display:none;}
+.form-search label,.form-inline label,.form-search .btn-group,.form-inline .btn-group{display:inline-block;}
+.form-search .input-append,.form-inline .input-append,.form-search .input-prepend,.form-inline .input-prepend{margin-bottom:0;}
+.form-search .radio,.form-search .checkbox,.form-inline .radio,.form-inline .checkbox{padding-left:0;margin-bottom:0;vertical-align:middle;}
+.form-search .radio input[type="radio"],.form-search .checkbox input[type="checkbox"],.form-inline .radio input[type="radio"],.form-inline .checkbox input[type="checkbox"]{float:left;margin-right:3px;margin-left:0;}
+.control-group{margin-bottom:10px;}
+legend+.control-group{margin-top:20px;-webkit-margin-top-collapse:separate;}
+.form-horizontal .control-group{margin-bottom:20px;*zoom:1;}.form-horizontal .control-group:before,.form-horizontal .control-group:after{display:table;content:"";line-height:0;}
+.form-horizontal .control-group:after{clear:both;}
+.form-horizontal .control-label{float:left;width:160px;padding-top:5px;text-align:right;}
+.form-horizontal .controls{*display:inline-block;*padding-left:20px;margin-left:180px;*margin-left:0;}.form-horizontal .controls:first-child{*padding-left:180px;}
+.form-horizontal .help-block{margin-bottom:0;}
+.form-horizontal input+.help-block,.form-horizontal select+.help-block,.form-horizontal textarea+.help-block,.form-horizontal .uneditable-input+.help-block,.form-horizontal .input-prepend+.help-block,.form-horizontal .input-append+.help-block{margin-top:10px;}
+.form-horizontal .form-actions{padding-left:180px;}
+.btn{display:inline-block;*display:inline;*zoom:1;padding:4px 12px;margin-bottom:0;font-size:14px;line-height:20px;text-align:center;vertical-align:middle;cursor:pointer;color:#333333;text-shadow:0 1px 1px rgba(255, 255, 255, 0.75);background-color:#f5f5f5;background-image:-moz-linear-gradient(top, #ffffff, #e6e6e6);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#ffffff), to(#e6e6e6));background-image:-webkit-linear-gradient(top, #ffffff, #e6e6e6);background-image:-o-linear- [...]
+.btn:active,.btn.active{background-color:#cccccc \9;}
+.btn:first-child{*margin-left:0;}
+.btn:hover{color:#333333;text-decoration:none;background-position:0 -15px;-webkit-transition:background-position 0.1s linear;-moz-transition:background-position 0.1s linear;-o-transition:background-position 0.1s linear;transition:background-position 0.1s linear;}
+.btn:focus{outline:thin dotted #333;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px;}
+.btn.active,.btn:active{background-image:none;outline:0;-webkit-box-shadow:inset 0 2px 4px rgba(0,0,0,.15), 0 1px 2px rgba(0,0,0,.05);-moz-box-shadow:inset 0 2px 4px rgba(0,0,0,.15), 0 1px 2px rgba(0,0,0,.05);box-shadow:inset 0 2px 4px rgba(0,0,0,.15), 0 1px 2px rgba(0,0,0,.05);}
+.btn.disabled,.btn[disabled]{cursor:default;background-image:none;opacity:0.65;filter:alpha(opacity=65);-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;}
+.btn-large{padding:11px 19px;font-size:17.5px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}
+.btn-large [class^="icon-"],.btn-large [class*=" icon-"]{margin-top:4px;}
+.btn-small{padding:2px 10px;font-size:11.9px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+.btn-small [class^="icon-"],.btn-small [class*=" icon-"]{margin-top:0;}
+.btn-mini [class^="icon-"],.btn-mini [class*=" icon-"]{margin-top:-1px;}
+.btn-mini{padding:0 6px;font-size:10.5px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+.btn-block{display:block;width:100%;padding-left:0;padding-right:0;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;}
+.btn-block+.btn-block{margin-top:5px;}
+input[type="submit"].btn-block,input[type="reset"].btn-block,input[type="button"].btn-block{width:100%;}
+.btn-primary.active,.btn-warning.active,.btn-danger.active,.btn-success.active,.btn-info.active,.btn-inverse.active{color:rgba(255, 255, 255, 0.75);}
+.btn{border-color:#c5c5c5;border-color:rgba(0, 0, 0, 0.15) rgba(0, 0, 0, 0.15) rgba(0, 0, 0, 0.25);}
+.btn-primary{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#006dcc;background-image:-moz-linear-gradient(top, #0088cc, #0044cc);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#0088cc), to(#0044cc));background-image:-webkit-linear-gradient(top, #0088cc, #0044cc);background-image:-o-linear-gradient(top, #0088cc, #0044cc);background-image:linear-gradient(to bottom, #0088cc, #0044cc);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gr [...]
+.btn-primary:active,.btn-primary.active{background-color:#003399 \9;}
+.btn-warning{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#faa732;background-image:-moz-linear-gradient(top, #fbb450, #f89406);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#fbb450), to(#f89406));background-image:-webkit-linear-gradient(top, #fbb450, #f89406);background-image:-o-linear-gradient(top, #fbb450, #f89406);background-image:linear-gradient(to bottom, #fbb450, #f89406);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gr [...]
+.btn-warning:active,.btn-warning.active{background-color:#c67605 \9;}
+.btn-danger{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#da4f49;background-image:-moz-linear-gradient(top, #ee5f5b, #bd362f);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#ee5f5b), to(#bd362f));background-image:-webkit-linear-gradient(top, #ee5f5b, #bd362f);background-image:-o-linear-gradient(top, #ee5f5b, #bd362f);background-image:linear-gradient(to bottom, #ee5f5b, #bd362f);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gra [...]
+.btn-danger:active,.btn-danger.active{background-color:#942a25 \9;}
+.btn-success{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#5bb75b;background-image:-moz-linear-gradient(top, #62c462, #51a351);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#62c462), to(#51a351));background-image:-webkit-linear-gradient(top, #62c462, #51a351);background-image:-o-linear-gradient(top, #62c462, #51a351);background-image:linear-gradient(to bottom, #62c462, #51a351);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gr [...]
+.btn-success:active,.btn-success.active{background-color:#408140 \9;}
+.btn-info{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#49afcd;background-image:-moz-linear-gradient(top, #5bc0de, #2f96b4);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#5bc0de), to(#2f96b4));background-image:-webkit-linear-gradient(top, #5bc0de, #2f96b4);background-image:-o-linear-gradient(top, #5bc0de, #2f96b4);background-image:linear-gradient(to bottom, #5bc0de, #2f96b4);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradi [...]
+.btn-info:active,.btn-info.active{background-color:#24748c \9;}
+.btn-inverse{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#363636;background-image:-moz-linear-gradient(top, #444444, #222222);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#444444), to(#222222));background-image:-webkit-linear-gradient(top, #444444, #222222);background-image:-o-linear-gradient(top, #444444, #222222);background-image:linear-gradient(to bottom, #444444, #222222);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gr [...]
+.btn-inverse:active,.btn-inverse.active{background-color:#080808 \9;}
+button.btn,input[type="submit"].btn{*padding-top:3px;*padding-bottom:3px;}button.btn::-moz-focus-inner,input[type="submit"].btn::-moz-focus-inner{padding:0;border:0;}
+button.btn.btn-large,input[type="submit"].btn.btn-large{*padding-top:7px;*padding-bottom:7px;}
+button.btn.btn-small,input[type="submit"].btn.btn-small{*padding-top:3px;*padding-bottom:3px;}
+button.btn.btn-mini,input[type="submit"].btn.btn-mini{*padding-top:1px;*padding-bottom:1px;}
+.btn-link,.btn-link:active,.btn-link[disabled]{background-color:transparent;background-image:none;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;}
+.btn-link{border-color:transparent;cursor:pointer;color:#0088cc;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.btn-link:hover{color:#005580;text-decoration:underline;background-color:transparent;}
+.btn-link[disabled]:hover{color:#333333;text-decoration:none;}
+[class^="icon-"],[class*=" icon-"]{display:inline-block;width:14px;height:14px;*margin-right:.3em;line-height:14px;vertical-align:text-top;background-image:url("../img/glyphicons-halflings.png");background-position:14px 14px;background-repeat:no-repeat;margin-top:1px;}
+.icon-white,.nav-pills>.active>a>[class^="icon-"],.nav-pills>.active>a>[class*=" icon-"],.nav-list>.active>a>[class^="icon-"],.nav-list>.active>a>[class*=" icon-"],.navbar-inverse .nav>.active>a>[class^="icon-"],.navbar-inverse .nav>.active>a>[class*=" icon-"],.dropdown-menu>li>a:hover>[class^="icon-"],.dropdown-menu>li>a:hover>[class*=" icon-"],.dropdown-menu>.active>a>[class^="icon-"],.dropdown-menu>.active>a>[class*=" icon-"],.dropdown-submenu:hover>a>[class^="icon-"],.dropdown-submen [...]
+.icon-glass{background-position:0 0;}
+.icon-music{background-position:-24px 0;}
+.icon-search{background-position:-48px 0;}
+.icon-envelope{background-position:-72px 0;}
+.icon-heart{background-position:-96px 0;}
+.icon-star{background-position:-120px 0;}
+.icon-star-empty{background-position:-144px 0;}
+.icon-user{background-position:-168px 0;}
+.icon-film{background-position:-192px 0;}
+.icon-th-large{background-position:-216px 0;}
+.icon-th{background-position:-240px 0;}
+.icon-th-list{background-position:-264px 0;}
+.icon-ok{background-position:-288px 0;}
+.icon-remove{background-position:-312px 0;}
+.icon-zoom-in{background-position:-336px 0;}
+.icon-zoom-out{background-position:-360px 0;}
+.icon-off{background-position:-384px 0;}
+.icon-signal{background-position:-408px 0;}
+.icon-cog{background-position:-432px 0;}
+.icon-trash{background-position:-456px 0;}
+.icon-home{background-position:0 -24px;}
+.icon-file{background-position:-24px -24px;}
+.icon-time{background-position:-48px -24px;}
+.icon-road{background-position:-72px -24px;}
+.icon-download-alt{background-position:-96px -24px;}
+.icon-download{background-position:-120px -24px;}
+.icon-upload{background-position:-144px -24px;}
+.icon-inbox{background-position:-168px -24px;}
+.icon-play-circle{background-position:-192px -24px;}
+.icon-repeat{background-position:-216px -24px;}
+.icon-refresh{background-position:-240px -24px;}
+.icon-list-alt{background-position:-264px -24px;}
+.icon-lock{background-position:-287px -24px;}
+.icon-flag{background-position:-312px -24px;}
+.icon-headphones{background-position:-336px -24px;}
+.icon-volume-off{background-position:-360px -24px;}
+.icon-volume-down{background-position:-384px -24px;}
+.icon-volume-up{background-position:-408px -24px;}
+.icon-qrcode{background-position:-432px -24px;}
+.icon-barcode{background-position:-456px -24px;}
+.icon-tag{background-position:0 -48px;}
+.icon-tags{background-position:-25px -48px;}
+.icon-book{background-position:-48px -48px;}
+.icon-bookmark{background-position:-72px -48px;}
+.icon-print{background-position:-96px -48px;}
+.icon-camera{background-position:-120px -48px;}
+.icon-font{background-position:-144px -48px;}
+.icon-bold{background-position:-167px -48px;}
+.icon-italic{background-position:-192px -48px;}
+.icon-text-height{background-position:-216px -48px;}
+.icon-text-width{background-position:-240px -48px;}
+.icon-align-left{background-position:-264px -48px;}
+.icon-align-center{background-position:-288px -48px;}
+.icon-align-right{background-position:-312px -48px;}
+.icon-align-justify{background-position:-336px -48px;}
+.icon-list{background-position:-360px -48px;}
+.icon-indent-left{background-position:-384px -48px;}
+.icon-indent-right{background-position:-408px -48px;}
+.icon-facetime-video{background-position:-432px -48px;}
+.icon-picture{background-position:-456px -48px;}
+.icon-pencil{background-position:0 -72px;}
+.icon-map-marker{background-position:-24px -72px;}
+.icon-adjust{background-position:-48px -72px;}
+.icon-tint{background-position:-72px -72px;}
+.icon-edit{background-position:-96px -72px;}
+.icon-share{background-position:-120px -72px;}
+.icon-check{background-position:-144px -72px;}
+.icon-move{background-position:-168px -72px;}
+.icon-step-backward{background-position:-192px -72px;}
+.icon-fast-backward{background-position:-216px -72px;}
+.icon-backward{background-position:-240px -72px;}
+.icon-play{background-position:-264px -72px;}
+.icon-pause{background-position:-288px -72px;}
+.icon-stop{background-position:-312px -72px;}
+.icon-forward{background-position:-336px -72px;}
+.icon-fast-forward{background-position:-360px -72px;}
+.icon-step-forward{background-position:-384px -72px;}
+.icon-eject{background-position:-408px -72px;}
+.icon-chevron-left{background-position:-432px -72px;}
+.icon-chevron-right{background-position:-456px -72px;}
+.icon-plus-sign{background-position:0 -96px;}
+.icon-minus-sign{background-position:-24px -96px;}
+.icon-remove-sign{background-position:-48px -96px;}
+.icon-ok-sign{background-position:-72px -96px;}
+.icon-question-sign{background-position:-96px -96px;}
+.icon-info-sign{background-position:-120px -96px;}
+.icon-screenshot{background-position:-144px -96px;}
+.icon-remove-circle{background-position:-168px -96px;}
+.icon-ok-circle{background-position:-192px -96px;}
+.icon-ban-circle{background-position:-216px -96px;}
+.icon-arrow-left{background-position:-240px -96px;}
+.icon-arrow-right{background-position:-264px -96px;}
+.icon-arrow-up{background-position:-289px -96px;}
+.icon-arrow-down{background-position:-312px -96px;}
+.icon-share-alt{background-position:-336px -96px;}
+.icon-resize-full{background-position:-360px -96px;}
+.icon-resize-small{background-position:-384px -96px;}
+.icon-plus{background-position:-408px -96px;}
+.icon-minus{background-position:-433px -96px;}
+.icon-asterisk{background-position:-456px -96px;}
+.icon-exclamation-sign{background-position:0 -120px;}
+.icon-gift{background-position:-24px -120px;}
+.icon-leaf{background-position:-48px -120px;}
+.icon-fire{background-position:-72px -120px;}
+.icon-eye-open{background-position:-96px -120px;}
+.icon-eye-close{background-position:-120px -120px;}
+.icon-warning-sign{background-position:-144px -120px;}
+.icon-plane{background-position:-168px -120px;}
+.icon-calendar{background-position:-192px -120px;}
+.icon-random{background-position:-216px -120px;width:16px;}
+.icon-comment{background-position:-240px -120px;}
+.icon-magnet{background-position:-264px -120px;}
+.icon-chevron-up{background-position:-288px -120px;}
+.icon-chevron-down{background-position:-313px -119px;}
+.icon-retweet{background-position:-336px -120px;}
+.icon-shopping-cart{background-position:-360px -120px;}
+.icon-folder-close{background-position:-384px -120px;}
+.icon-folder-open{background-position:-408px -120px;width:16px;}
+.icon-resize-vertical{background-position:-432px -119px;}
+.icon-resize-horizontal{background-position:-456px -118px;}
+.icon-hdd{background-position:0 -144px;}
+.icon-bullhorn{background-position:-24px -144px;}
+.icon-bell{background-position:-48px -144px;}
+.icon-certificate{background-position:-72px -144px;}
+.icon-thumbs-up{background-position:-96px -144px;}
+.icon-thumbs-down{background-position:-120px -144px;}
+.icon-hand-right{background-position:-144px -144px;}
+.icon-hand-left{background-position:-168px -144px;}
+.icon-hand-up{background-position:-192px -144px;}
+.icon-hand-down{background-position:-216px -144px;}
+.icon-circle-arrow-right{background-position:-240px -144px;}
+.icon-circle-arrow-left{background-position:-264px -144px;}
+.icon-circle-arrow-up{background-position:-288px -144px;}
+.icon-circle-arrow-down{background-position:-312px -144px;}
+.icon-globe{background-position:-336px -144px;}
+.icon-wrench{background-position:-360px -144px;}
+.icon-tasks{background-position:-384px -144px;}
+.icon-filter{background-position:-408px -144px;}
+.icon-briefcase{background-position:-432px -144px;}
+.icon-fullscreen{background-position:-456px -144px;}
+.btn-group{position:relative;display:inline-block;*display:inline;*zoom:1;font-size:0;vertical-align:middle;white-space:nowrap;*margin-left:.3em;}.btn-group:first-child{*margin-left:0;}
+.btn-group+.btn-group{margin-left:5px;}
+.btn-toolbar{font-size:0;margin-top:10px;margin-bottom:10px;}.btn-toolbar>.btn+.btn,.btn-toolbar>.btn-group+.btn,.btn-toolbar>.btn+.btn-group{margin-left:5px;}
+.btn-group>.btn{position:relative;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.btn-group>.btn+.btn{margin-left:-1px;}
+.btn-group>.btn,.btn-group>.dropdown-menu,.btn-group>.popover{font-size:14px;}
+.btn-group>.btn-mini{font-size:10.5px;}
+.btn-group>.btn-small{font-size:11.9px;}
+.btn-group>.btn-large{font-size:17.5px;}
+.btn-group>.btn:first-child{margin-left:0;-webkit-border-top-left-radius:4px;-moz-border-radius-topleft:4px;border-top-left-radius:4px;-webkit-border-bottom-left-radius:4px;-moz-border-radius-bottomleft:4px;border-bottom-left-radius:4px;}
+.btn-group>.btn:last-child,.btn-group>.dropdown-toggle{-webkit-border-top-right-radius:4px;-moz-border-radius-topright:4px;border-top-right-radius:4px;-webkit-border-bottom-right-radius:4px;-moz-border-radius-bottomright:4px;border-bottom-right-radius:4px;}
+.btn-group>.btn.large:first-child{margin-left:0;-webkit-border-top-left-radius:6px;-moz-border-radius-topleft:6px;border-top-left-radius:6px;-webkit-border-bottom-left-radius:6px;-moz-border-radius-bottomleft:6px;border-bottom-left-radius:6px;}
+.btn-group>.btn.large:last-child,.btn-group>.large.dropdown-toggle{-webkit-border-top-right-radius:6px;-moz-border-radius-topright:6px;border-top-right-radius:6px;-webkit-border-bottom-right-radius:6px;-moz-border-radius-bottomright:6px;border-bottom-right-radius:6px;}
+.btn-group>.btn:hover,.btn-group>.btn:focus,.btn-group>.btn:active,.btn-group>.btn.active{z-index:2;}
+.btn-group .dropdown-toggle:active,.btn-group.open .dropdown-toggle{outline:0;}
+.btn-group>.btn+.dropdown-toggle{padding-left:8px;padding-right:8px;-webkit-box-shadow:inset 1px 0 0 rgba(255,255,255,.125), inset 0 1px 0 rgba(255,255,255,.2), 0 1px 2px rgba(0,0,0,.05);-moz-box-shadow:inset 1px 0 0 rgba(255,255,255,.125), inset 0 1px 0 rgba(255,255,255,.2), 0 1px 2px rgba(0,0,0,.05);box-shadow:inset 1px 0 0 rgba(255,255,255,.125), inset 0 1px 0 rgba(255,255,255,.2), 0 1px 2px rgba(0,0,0,.05);*padding-top:5px;*padding-bottom:5px;}
+.btn-group>.btn-mini+.dropdown-toggle{padding-left:5px;padding-right:5px;*padding-top:2px;*padding-bottom:2px;}
+.btn-group>.btn-small+.dropdown-toggle{*padding-top:5px;*padding-bottom:4px;}
+.btn-group>.btn-large+.dropdown-toggle{padding-left:12px;padding-right:12px;*padding-top:7px;*padding-bottom:7px;}
+.btn-group.open .dropdown-toggle{background-image:none;-webkit-box-shadow:inset 0 2px 4px rgba(0,0,0,.15), 0 1px 2px rgba(0,0,0,.05);-moz-box-shadow:inset 0 2px 4px rgba(0,0,0,.15), 0 1px 2px rgba(0,0,0,.05);box-shadow:inset 0 2px 4px rgba(0,0,0,.15), 0 1px 2px rgba(0,0,0,.05);}
+.btn-group.open .btn.dropdown-toggle{background-color:#e6e6e6;}
+.btn-group.open .btn-primary.dropdown-toggle{background-color:#0044cc;}
+.btn-group.open .btn-warning.dropdown-toggle{background-color:#f89406;}
+.btn-group.open .btn-danger.dropdown-toggle{background-color:#bd362f;}
+.btn-group.open .btn-success.dropdown-toggle{background-color:#51a351;}
+.btn-group.open .btn-info.dropdown-toggle{background-color:#2f96b4;}
+.btn-group.open .btn-inverse.dropdown-toggle{background-color:#222222;}
+.btn .caret{margin-top:8px;margin-left:0;}
+.btn-mini .caret,.btn-small .caret,.btn-large .caret{margin-top:6px;}
+.btn-large .caret{border-left-width:5px;border-right-width:5px;border-top-width:5px;}
+.dropup .btn-large .caret{border-bottom-width:5px;}
+.btn-primary .caret,.btn-warning .caret,.btn-danger .caret,.btn-info .caret,.btn-success .caret,.btn-inverse .caret{border-top-color:#ffffff;border-bottom-color:#ffffff;}
+.btn-group-vertical{display:inline-block;*display:inline;*zoom:1;}
+.btn-group-vertical>.btn{display:block;float:none;max-width:100%;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.btn-group-vertical>.btn+.btn{margin-left:0;margin-top:-1px;}
+.btn-group-vertical>.btn:first-child{-webkit-border-radius:4px 4px 0 0;-moz-border-radius:4px 4px 0 0;border-radius:4px 4px 0 0;}
+.btn-group-vertical>.btn:last-child{-webkit-border-radius:0 0 4px 4px;-moz-border-radius:0 0 4px 4px;border-radius:0 0 4px 4px;}
+.btn-group-vertical>.btn-large:first-child{-webkit-border-radius:6px 6px 0 0;-moz-border-radius:6px 6px 0 0;border-radius:6px 6px 0 0;}
+.btn-group-vertical>.btn-large:last-child{-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 6px;}
+.nav{margin-left:0;margin-bottom:20px;list-style:none;}
+.nav>li>a{display:block;}
+.nav>li>a:hover{text-decoration:none;background-color:#eeeeee;}
+.nav>li>a>img{max-width:none;}
+.nav>.pull-right{float:right;}
+.nav-header{display:block;padding:3px 15px;font-size:11px;font-weight:bold;line-height:20px;color:#999999;text-shadow:0 1px 0 rgba(255, 255, 255, 0.5);text-transform:uppercase;}
+.nav li+.nav-header{margin-top:9px;}
+.nav-list{padding-left:15px;padding-right:15px;margin-bottom:0;}
+.nav-list>li>a,.nav-list .nav-header{margin-left:-15px;margin-right:-15px;text-shadow:0 1px 0 rgba(255, 255, 255, 0.5);}
+.nav-list>li>a{padding:3px 15px;}
+.nav-list>.active>a,.nav-list>.active>a:hover{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.2);background-color:#0088cc;}
+.nav-list [class^="icon-"],.nav-list [class*=" icon-"]{margin-right:2px;}
+.nav-list .divider{*width:100%;height:1px;margin:9px 1px;*margin:-5px 0 5px;overflow:hidden;background-color:#e5e5e5;border-bottom:1px solid #ffffff;}
+.nav-tabs,.nav-pills{*zoom:1;}.nav-tabs:before,.nav-pills:before,.nav-tabs:after,.nav-pills:after{display:table;content:"";line-height:0;}
+.nav-tabs:after,.nav-pills:after{clear:both;}
+.nav-tabs>li,.nav-pills>li{float:left;}
+.nav-tabs>li>a,.nav-pills>li>a{padding-right:12px;padding-left:12px;margin-right:2px;line-height:14px;}
+.nav-tabs{border-bottom:1px solid #ddd;}
+.nav-tabs>li{margin-bottom:-1px;}
+.nav-tabs>li>a{padding-top:8px;padding-bottom:8px;line-height:20px;border:1px solid transparent;-webkit-border-radius:4px 4px 0 0;-moz-border-radius:4px 4px 0 0;border-radius:4px 4px 0 0;}.nav-tabs>li>a:hover{border-color:#eeeeee #eeeeee #dddddd;}
+.nav-tabs>.active>a,.nav-tabs>.active>a:hover{color:#555555;background-color:#ffffff;border:1px solid #ddd;border-bottom-color:transparent;cursor:default;}
+.nav-pills>li>a{padding-top:8px;padding-bottom:8px;margin-top:2px;margin-bottom:2px;-webkit-border-radius:5px;-moz-border-radius:5px;border-radius:5px;}
+.nav-pills>.active>a,.nav-pills>.active>a:hover{color:#ffffff;background-color:#0088cc;}
+.nav-stacked>li{float:none;}
+.nav-stacked>li>a{margin-right:0;}
+.nav-tabs.nav-stacked{border-bottom:0;}
+.nav-tabs.nav-stacked>li>a{border:1px solid #ddd;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.nav-tabs.nav-stacked>li:first-child>a{-webkit-border-top-right-radius:4px;-moz-border-radius-topright:4px;border-top-right-radius:4px;-webkit-border-top-left-radius:4px;-moz-border-radius-topleft:4px;border-top-left-radius:4px;}
+.nav-tabs.nav-stacked>li:last-child>a{-webkit-border-bottom-right-radius:4px;-moz-border-radius-bottomright:4px;border-bottom-right-radius:4px;-webkit-border-bottom-left-radius:4px;-moz-border-radius-bottomleft:4px;border-bottom-left-radius:4px;}
+.nav-tabs.nav-stacked>li>a:hover{border-color:#ddd;z-index:2;}
+.nav-pills.nav-stacked>li>a{margin-bottom:3px;}
+.nav-pills.nav-stacked>li:last-child>a{margin-bottom:1px;}
+.nav-tabs .dropdown-menu{-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 6px;}
+.nav-pills .dropdown-menu{-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}
+.nav .dropdown-toggle .caret{border-top-color:#0088cc;border-bottom-color:#0088cc;margin-top:6px;}
+.nav .dropdown-toggle:hover .caret{border-top-color:#005580;border-bottom-color:#005580;}
+.nav-tabs .dropdown-toggle .caret{margin-top:8px;}
+.nav .active .dropdown-toggle .caret{border-top-color:#fff;border-bottom-color:#fff;}
+.nav-tabs .active .dropdown-toggle .caret{border-top-color:#555555;border-bottom-color:#555555;}
+.nav>.dropdown.active>a:hover{cursor:pointer;}
+.nav-tabs .open .dropdown-toggle,.nav-pills .open .dropdown-toggle,.nav>li.dropdown.open.active>a:hover{color:#ffffff;background-color:#999999;border-color:#999999;}
+.nav li.dropdown.open .caret,.nav li.dropdown.open.active .caret,.nav li.dropdown.open a:hover .caret{border-top-color:#ffffff;border-bottom-color:#ffffff;opacity:1;filter:alpha(opacity=100);}
+.tabs-stacked .open>a:hover{border-color:#999999;}
+.tabbable{*zoom:1;}.tabbable:before,.tabbable:after{display:table;content:"";line-height:0;}
+.tabbable:after{clear:both;}
+.tab-content{overflow:auto;}
+.tabs-below>.nav-tabs,.tabs-right>.nav-tabs,.tabs-left>.nav-tabs{border-bottom:0;}
+.tab-content>.tab-pane,.pill-content>.pill-pane{display:none;}
+.tab-content>.active,.pill-content>.active{display:block;}
+.tabs-below>.nav-tabs{border-top:1px solid #ddd;}
+.tabs-below>.nav-tabs>li{margin-top:-1px;margin-bottom:0;}
+.tabs-below>.nav-tabs>li>a{-webkit-border-radius:0 0 4px 4px;-moz-border-radius:0 0 4px 4px;border-radius:0 0 4px 4px;}.tabs-below>.nav-tabs>li>a:hover{border-bottom-color:transparent;border-top-color:#ddd;}
+.tabs-below>.nav-tabs>.active>a,.tabs-below>.nav-tabs>.active>a:hover{border-color:transparent #ddd #ddd #ddd;}
+.tabs-left>.nav-tabs>li,.tabs-right>.nav-tabs>li{float:none;}
+.tabs-left>.nav-tabs>li>a,.tabs-right>.nav-tabs>li>a{min-width:74px;margin-right:0;margin-bottom:3px;}
+.tabs-left>.nav-tabs{float:left;margin-right:19px;border-right:1px solid #ddd;}
+.tabs-left>.nav-tabs>li>a{margin-right:-1px;-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px;}
+.tabs-left>.nav-tabs>li>a:hover{border-color:#eeeeee #dddddd #eeeeee #eeeeee;}
+.tabs-left>.nav-tabs .active>a,.tabs-left>.nav-tabs .active>a:hover{border-color:#ddd transparent #ddd #ddd;*border-right-color:#ffffff;}
+.tabs-right>.nav-tabs{float:right;margin-left:19px;border-left:1px solid #ddd;}
+.tabs-right>.nav-tabs>li>a{margin-left:-1px;-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0;}
+.tabs-right>.nav-tabs>li>a:hover{border-color:#eeeeee #eeeeee #eeeeee #dddddd;}
+.tabs-right>.nav-tabs .active>a,.tabs-right>.nav-tabs .active>a:hover{border-color:#ddd #ddd #ddd transparent;*border-left-color:#ffffff;}
+.nav>.disabled>a{color:#999999;}
+.nav>.disabled>a:hover{text-decoration:none;background-color:transparent;cursor:default;}
+.navbar{overflow:visible;margin-bottom:20px;*position:relative;*z-index:2;}
+.navbar-inner{min-height:40px;padding-left:20px;padding-right:20px;background-color:#fafafa;background-image:-moz-linear-gradient(top, #ffffff, #f2f2f2);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#ffffff), to(#f2f2f2));background-image:-webkit-linear-gradient(top, #ffffff, #f2f2f2);background-image:-o-linear-gradient(top, #ffffff, #f2f2f2);background-image:linear-gradient(to bottom, #ffffff, #f2f2f2);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gra [...]
+.navbar-inner:after{clear:both;}
+.navbar .container{width:auto;}
+.nav-collapse.collapse{height:auto;overflow:visible;}
+.navbar .brand{float:left;display:block;padding:10px 20px 10px;margin-left:-20px;font-size:20px;font-weight:200;color:#777777;text-shadow:0 1px 0 #ffffff;}.navbar .brand:hover{text-decoration:none;}
+.navbar-text{margin-bottom:0;line-height:40px;color:#777777;}
+.navbar-link{color:#777777;}.navbar-link:hover{color:#333333;}
+.navbar .divider-vertical{height:40px;margin:0 9px;border-left:1px solid #f2f2f2;border-right:1px solid #ffffff;}
+.navbar .btn,.navbar .btn-group{margin-top:5px;}
+.navbar .btn-group .btn,.navbar .input-prepend .btn,.navbar .input-append .btn{margin-top:0;}
+.navbar-form{margin-bottom:0;*zoom:1;}.navbar-form:before,.navbar-form:after{display:table;content:"";line-height:0;}
+.navbar-form:after{clear:both;}
+.navbar-form input,.navbar-form select,.navbar-form .radio,.navbar-form .checkbox{margin-top:5px;}
+.navbar-form input,.navbar-form select,.navbar-form .btn{display:inline-block;margin-bottom:0;}
+.navbar-form input[type="image"],.navbar-form input[type="checkbox"],.navbar-form input[type="radio"]{margin-top:3px;}
+.navbar-form .input-append,.navbar-form .input-prepend{margin-top:5px;white-space:nowrap;}.navbar-form .input-append input,.navbar-form .input-prepend input{margin-top:0;}
+.navbar-search{position:relative;float:left;margin-top:5px;margin-bottom:0;}.navbar-search .search-query{margin-bottom:0;padding:4px 14px;font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-size:13px;font-weight:normal;line-height:1;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px;}
+.navbar-static-top{position:static;margin-bottom:0;}.navbar-static-top .navbar-inner{-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.navbar-fixed-top,.navbar-fixed-bottom{position:fixed;right:0;left:0;z-index:1030;margin-bottom:0;}
+.navbar-fixed-top .navbar-inner,.navbar-static-top .navbar-inner{border-width:0 0 1px;}
+.navbar-fixed-bottom .navbar-inner{border-width:1px 0 0;}
+.navbar-fixed-top .navbar-inner,.navbar-fixed-bottom .navbar-inner{padding-left:0;padding-right:0;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;}
+.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom .container{width:940px;}
+.navbar-fixed-top{top:0;}
+.navbar-fixed-top .navbar-inner,.navbar-static-top .navbar-inner{-webkit-box-shadow:0 1px 10px rgba(0,0,0,.1);-moz-box-shadow:0 1px 10px rgba(0,0,0,.1);box-shadow:0 1px 10px rgba(0,0,0,.1);}
+.navbar-fixed-bottom{bottom:0;}.navbar-fixed-bottom .navbar-inner{-webkit-box-shadow:0 -1px 10px rgba(0,0,0,.1);-moz-box-shadow:0 -1px 10px rgba(0,0,0,.1);box-shadow:0 -1px 10px rgba(0,0,0,.1);}
+.navbar .nav{position:relative;left:0;display:block;float:left;margin:0 10px 0 0;}
+.navbar .nav.pull-right{float:right;margin-right:0;}
+.navbar .nav>li{float:left;}
+.navbar .nav>li>a{float:none;padding:10px 15px 10px;color:#777777;text-decoration:none;text-shadow:0 1px 0 #ffffff;}
+.navbar .nav .dropdown-toggle .caret{margin-top:8px;}
+.navbar .nav>li>a:focus,.navbar .nav>li>a:hover{background-color:transparent;color:#333333;text-decoration:none;}
+.navbar .nav>.active>a,.navbar .nav>.active>a:hover,.navbar .nav>.active>a:focus{color:#555555;text-decoration:none;background-color:#e5e5e5;-webkit-box-shadow:inset 0 3px 8px rgba(0, 0, 0, 0.125);-moz-box-shadow:inset 0 3px 8px rgba(0, 0, 0, 0.125);box-shadow:inset 0 3px 8px rgba(0, 0, 0, 0.125);}
+.navbar .btn-navbar{display:none;float:right;padding:7px 10px;margin-left:5px;margin-right:5px;color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#ededed;background-image:-moz-linear-gradient(top, #f2f2f2, #e5e5e5);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#f2f2f2), to(#e5e5e5));background-image:-webkit-linear-gradient(top, #f2f2f2, #e5e5e5);background-image:-o-linear-gradient(top, #f2f2f2, #e5e5e5);background-image:linear-gradient(to bottom, #f2f2f [...]
+.navbar .btn-navbar:active,.navbar .btn-navbar.active{background-color:#cccccc \9;}
+.navbar .btn-navbar .icon-bar{display:block;width:18px;height:2px;background-color:#f5f5f5;-webkit-border-radius:1px;-moz-border-radius:1px;border-radius:1px;-webkit-box-shadow:0 1px 0 rgba(0, 0, 0, 0.25);-moz-box-shadow:0 1px 0 rgba(0, 0, 0, 0.25);box-shadow:0 1px 0 rgba(0, 0, 0, 0.25);}
+.btn-navbar .icon-bar+.icon-bar{margin-top:3px;}
+.navbar .nav>li>.dropdown-menu:before{content:'';display:inline-block;border-left:7px solid transparent;border-right:7px solid transparent;border-bottom:7px solid #ccc;border-bottom-color:rgba(0, 0, 0, 0.2);position:absolute;top:-7px;left:9px;}
+.navbar .nav>li>.dropdown-menu:after{content:'';display:inline-block;border-left:6px solid transparent;border-right:6px solid transparent;border-bottom:6px solid #ffffff;position:absolute;top:-6px;left:10px;}
+.navbar-fixed-bottom .nav>li>.dropdown-menu:before{border-top:7px solid #ccc;border-top-color:rgba(0, 0, 0, 0.2);border-bottom:0;bottom:-7px;top:auto;}
+.navbar-fixed-bottom .nav>li>.dropdown-menu:after{border-top:6px solid #ffffff;border-bottom:0;bottom:-6px;top:auto;}
+.navbar .nav li.dropdown>a:hover .caret{border-top-color:#555555;border-bottom-color:#555555;}
+.navbar .nav li.dropdown.open>.dropdown-toggle,.navbar .nav li.dropdown.active>.dropdown-toggle,.navbar .nav li.dropdown.open.active>.dropdown-toggle{background-color:#e5e5e5;color:#555555;}
+.navbar .nav li.dropdown>.dropdown-toggle .caret{border-top-color:#777777;border-bottom-color:#777777;}
+.navbar .nav li.dropdown.open>.dropdown-toggle .caret,.navbar .nav li.dropdown.active>.dropdown-toggle .caret,.navbar .nav li.dropdown.open.active>.dropdown-toggle .caret{border-top-color:#555555;border-bottom-color:#555555;}
+.navbar .pull-right>li>.dropdown-menu,.navbar .nav>li>.dropdown-menu.pull-right{left:auto;right:0;}.navbar .pull-right>li>.dropdown-menu:before,.navbar .nav>li>.dropdown-menu.pull-right:before{left:auto;right:12px;}
+.navbar .pull-right>li>.dropdown-menu:after,.navbar .nav>li>.dropdown-menu.pull-right:after{left:auto;right:13px;}
+.navbar .pull-right>li>.dropdown-menu .dropdown-menu,.navbar .nav>li>.dropdown-menu.pull-right .dropdown-menu{left:auto;right:100%;margin-left:0;margin-right:-1px;-webkit-border-radius:6px 0 6px 6px;-moz-border-radius:6px 0 6px 6px;border-radius:6px 0 6px 6px;}
+.navbar-inverse .navbar-inner{background-color:#1b1b1b;background-image:-moz-linear-gradient(top, #222222, #111111);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#222222), to(#111111));background-image:-webkit-linear-gradient(top, #222222, #111111);background-image:-o-linear-gradient(top, #222222, #111111);background-image:linear-gradient(to bottom, #222222, #111111);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff222222', endC [...]
+.navbar-inverse .brand,.navbar-inverse .nav>li>a{color:#999999;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);}.navbar-inverse .brand:hover,.navbar-inverse .nav>li>a:hover{color:#ffffff;}
+.navbar-inverse .brand{color:#999999;}
+.navbar-inverse .navbar-text{color:#999999;}
+.navbar-inverse .nav>li>a:focus,.navbar-inverse .nav>li>a:hover{background-color:transparent;color:#ffffff;}
+.navbar-inverse .nav .active>a,.navbar-inverse .nav .active>a:hover,.navbar-inverse .nav .active>a:focus{color:#ffffff;background-color:#111111;}
+.navbar-inverse .navbar-link{color:#999999;}.navbar-inverse .navbar-link:hover{color:#ffffff;}
+.navbar-inverse .divider-vertical{border-left-color:#111111;border-right-color:#222222;}
+.navbar-inverse .nav li.dropdown.open>.dropdown-toggle,.navbar-inverse .nav li.dropdown.active>.dropdown-toggle,.navbar-inverse .nav li.dropdown.open.active>.dropdown-toggle{background-color:#111111;color:#ffffff;}
+.navbar-inverse .nav li.dropdown>a:hover .caret{border-top-color:#ffffff;border-bottom-color:#ffffff;}
+.navbar-inverse .nav li.dropdown>.dropdown-toggle .caret{border-top-color:#999999;border-bottom-color:#999999;}
+.navbar-inverse .nav li.dropdown.open>.dropdown-toggle .caret,.navbar-inverse .nav li.dropdown.active>.dropdown-toggle .caret,.navbar-inverse .nav li.dropdown.open.active>.dropdown-toggle .caret{border-top-color:#ffffff;border-bottom-color:#ffffff;}
+.navbar-inverse .navbar-search .search-query{color:#ffffff;background-color:#515151;border-color:#111111;-webkit-box-shadow:inset 0 1px 2px rgba(0,0,0,.1), 0 1px 0 rgba(255,255,255,.15);-moz-box-shadow:inset 0 1px 2px rgba(0,0,0,.1), 0 1px 0 rgba(255,255,255,.15);box-shadow:inset 0 1px 2px rgba(0,0,0,.1), 0 1px 0 rgba(255,255,255,.15);-webkit-transition:none;-moz-transition:none;-o-transition:none;transition:none;}.navbar-inverse .navbar-search .search-query:-moz-placeholder{color:#cccccc;}
+.navbar-inverse .navbar-search .search-query:-ms-input-placeholder{color:#cccccc;}
+.navbar-inverse .navbar-search .search-query::-webkit-input-placeholder{color:#cccccc;}
+.navbar-inverse .navbar-search .search-query:focus,.navbar-inverse .navbar-search .search-query.focused{padding:5px 15px;color:#333333;text-shadow:0 1px 0 #ffffff;background-color:#ffffff;border:0;-webkit-box-shadow:0 0 3px rgba(0, 0, 0, 0.15);-moz-box-shadow:0 0 3px rgba(0, 0, 0, 0.15);box-shadow:0 0 3px rgba(0, 0, 0, 0.15);outline:0;}
+.navbar-inverse .btn-navbar{color:#ffffff;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#0e0e0e;background-image:-moz-linear-gradient(top, #151515, #040404);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#151515), to(#040404));background-image:-webkit-linear-gradient(top, #151515, #040404);background-image:-o-linear-gradient(top, #151515, #040404);background-image:linear-gradient(to bottom, #151515, #040404);background-repeat:repeat-x;filter:progid:DXImageTransfo [...]
+.navbar-inverse .btn-navbar:active,.navbar-inverse .btn-navbar.active{background-color:#000000 \9;}
+.breadcrumb{padding:8px 15px;margin:0 0 20px;list-style:none;background-color:#f5f5f5;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;}.breadcrumb>li{display:inline-block;*display:inline;*zoom:1;text-shadow:0 1px 0 #ffffff;}.breadcrumb>li>.divider{padding:0 5px;color:#ccc;}
+.breadcrumb>.active{color:#999999;}
+.pagination{margin:20px 0;}
+.pagination ul{display:inline-block;*display:inline;*zoom:1;margin-left:0;margin-bottom:0;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:0 1px 2px rgba(0, 0, 0, 0.05);-moz-box-shadow:0 1px 2px rgba(0, 0, 0, 0.05);box-shadow:0 1px 2px rgba(0, 0, 0, 0.05);}
+.pagination ul>li{display:inline;}
+.pagination ul>li>a,.pagination ul>li>span{float:left;padding:4px 12px;line-height:20px;text-decoration:none;background-color:#ffffff;border:1px solid #dddddd;border-left-width:0;}
+.pagination ul>li>a:hover,.pagination ul>.active>a,.pagination ul>.active>span{background-color:#f5f5f5;}
+.pagination ul>.active>a,.pagination ul>.active>span{color:#999999;cursor:default;}
+.pagination ul>.disabled>span,.pagination ul>.disabled>a,.pagination ul>.disabled>a:hover{color:#999999;background-color:transparent;cursor:default;}
+.pagination ul>li:first-child>a,.pagination ul>li:first-child>span{border-left-width:1px;-webkit-border-top-left-radius:4px;-moz-border-radius-topleft:4px;border-top-left-radius:4px;-webkit-border-bottom-left-radius:4px;-moz-border-radius-bottomleft:4px;border-bottom-left-radius:4px;}
+.pagination ul>li:last-child>a,.pagination ul>li:last-child>span{-webkit-border-top-right-radius:4px;-moz-border-radius-topright:4px;border-top-right-radius:4px;-webkit-border-bottom-right-radius:4px;-moz-border-radius-bottomright:4px;border-bottom-right-radius:4px;}
+.pagination-centered{text-align:center;}
+.pagination-right{text-align:right;}
+.pagination-large ul>li>a,.pagination-large ul>li>span{padding:11px 19px;font-size:17.5px;}
+.pagination-large ul>li:first-child>a,.pagination-large ul>li:first-child>span{-webkit-border-top-left-radius:6px;-moz-border-radius-topleft:6px;border-top-left-radius:6px;-webkit-border-bottom-left-radius:6px;-moz-border-radius-bottomleft:6px;border-bottom-left-radius:6px;}
+.pagination-large ul>li:last-child>a,.pagination-large ul>li:last-child>span{-webkit-border-top-right-radius:6px;-moz-border-radius-topright:6px;border-top-right-radius:6px;-webkit-border-bottom-right-radius:6px;-moz-border-radius-bottomright:6px;border-bottom-right-radius:6px;}
+.pagination-mini ul>li:first-child>a,.pagination-small ul>li:first-child>a,.pagination-mini ul>li:first-child>span,.pagination-small ul>li:first-child>span{-webkit-border-top-left-radius:3px;-moz-border-radius-topleft:3px;border-top-left-radius:3px;-webkit-border-bottom-left-radius:3px;-moz-border-radius-bottomleft:3px;border-bottom-left-radius:3px;}
+.pagination-mini ul>li:last-child>a,.pagination-small ul>li:last-child>a,.pagination-mini ul>li:last-child>span,.pagination-small ul>li:last-child>span{-webkit-border-top-right-radius:3px;-moz-border-radius-topright:3px;border-top-right-radius:3px;-webkit-border-bottom-right-radius:3px;-moz-border-radius-bottomright:3px;border-bottom-right-radius:3px;}
+.pagination-small ul>li>a,.pagination-small ul>li>span{padding:2px 10px;font-size:11.9px;}
+.pagination-mini ul>li>a,.pagination-mini ul>li>span{padding:0 6px;font-size:10.5px;}
+.pager{margin:20px 0;list-style:none;text-align:center;*zoom:1;}.pager:before,.pager:after{display:table;content:"";line-height:0;}
+.pager:after{clear:both;}
+.pager li{display:inline;}
+.pager li>a,.pager li>span{display:inline-block;padding:5px 14px;background-color:#fff;border:1px solid #ddd;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px;}
+.pager li>a:hover{text-decoration:none;background-color:#f5f5f5;}
+.pager .next>a,.pager .next>span{float:right;}
+.pager .previous>a,.pager .previous>span{float:left;}
+.pager .disabled>a,.pager .disabled>a:hover,.pager .disabled>span{color:#999999;background-color:#fff;cursor:default;}
+.thumbnails{margin-left:-20px;list-style:none;*zoom:1;}.thumbnails:before,.thumbnails:after{display:table;content:"";line-height:0;}
+.thumbnails:after{clear:both;}
+.row-fluid .thumbnails{margin-left:0;}
+.thumbnails>li{float:left;margin-bottom:20px;margin-left:20px;}
+.thumbnail{display:block;padding:4px;line-height:20px;border:1px solid #ddd;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:0 1px 3px rgba(0, 0, 0, 0.055);-moz-box-shadow:0 1px 3px rgba(0, 0, 0, 0.055);box-shadow:0 1px 3px rgba(0, 0, 0, 0.055);-webkit-transition:all 0.2s ease-in-out;-moz-transition:all 0.2s ease-in-out;-o-transition:all 0.2s ease-in-out;transition:all 0.2s ease-in-out;}
+a.thumbnail:hover{border-color:#0088cc;-webkit-box-shadow:0 1px 4px rgba(0, 105, 214, 0.25);-moz-box-shadow:0 1px 4px rgba(0, 105, 214, 0.25);box-shadow:0 1px 4px rgba(0, 105, 214, 0.25);}
+.thumbnail>img{display:block;max-width:100%;margin-left:auto;margin-right:auto;}
+.thumbnail .caption{padding:9px;color:#555555;}
+.alert{padding:8px 35px 8px 14px;margin-bottom:20px;text-shadow:0 1px 0 rgba(255, 255, 255, 0.5);background-color:#fcf8e3;border:1px solid #fbeed5;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;}
+.alert,.alert h4{color:#c09853;}
+.alert h4{margin:0;}
+.alert .close{position:relative;top:-2px;right:-21px;line-height:20px;}
+.alert-success{background-color:#dff0d8;border-color:#d6e9c6;color:#468847;}
+.alert-success h4{color:#468847;}
+.alert-danger,.alert-error{background-color:#f2dede;border-color:#eed3d7;color:#b94a48;}
+.alert-danger h4,.alert-error h4{color:#b94a48;}
+.alert-info{background-color:#d9edf7;border-color:#bce8f1;color:#3a87ad;}
+.alert-info h4{color:#3a87ad;}
+.alert-block{padding-top:14px;padding-bottom:14px;}
+.alert-block>p,.alert-block>ul{margin-bottom:0;}
+.alert-block p+p{margin-top:5px;}
+@-webkit-keyframes progress-bar-stripes{from{background-position:40px 0;} to{background-position:0 0;}}@-moz-keyframes progress-bar-stripes{from{background-position:40px 0;} to{background-position:0 0;}}@-ms-keyframes progress-bar-stripes{from{background-position:40px 0;} to{background-position:0 0;}}@-o-keyframes progress-bar-stripes{from{background-position:0 0;} to{background-position:40px 0;}}@keyframes progress-bar-stripes{from{background-position:40px 0;} to{background-position:0 0 [...]
+.progress .bar{width:0%;height:100%;color:#ffffff;float:left;font-size:12px;text-align:center;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);background-color:#0e90d2;background-image:-moz-linear-gradient(top, #149bdf, #0480be);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#149bdf), to(#0480be));background-image:-webkit-linear-gradient(top, #149bdf, #0480be);background-image:-o-linear-gradient(top, #149bdf, #0480be);background-image:linear-gradient(to bottom, #149bdf, #0480be);bac [...]
+.progress .bar+.bar{-webkit-box-shadow:inset 1px 0 0 rgba(0,0,0,.15), inset 0 -1px 0 rgba(0,0,0,.15);-moz-box-shadow:inset 1px 0 0 rgba(0,0,0,.15), inset 0 -1px 0 rgba(0,0,0,.15);box-shadow:inset 1px 0 0 rgba(0,0,0,.15), inset 0 -1px 0 rgba(0,0,0,.15);}
+.progress-striped .bar{background-color:#149bdf;background-image:-webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent));background-image:-webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, [...]
+.progress.active .bar{-webkit-animation:progress-bar-stripes 2s linear infinite;-moz-animation:progress-bar-stripes 2s linear infinite;-ms-animation:progress-bar-stripes 2s linear infinite;-o-animation:progress-bar-stripes 2s linear infinite;animation:progress-bar-stripes 2s linear infinite;}
+.progress-danger .bar,.progress .bar-danger{background-color:#dd514c;background-image:-moz-linear-gradient(top, #ee5f5b, #c43c35);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#ee5f5b), to(#c43c35));background-image:-webkit-linear-gradient(top, #ee5f5b, #c43c35);background-image:-o-linear-gradient(top, #ee5f5b, #c43c35);background-image:linear-gradient(to bottom, #ee5f5b, #c43c35);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#f [...]
+.progress-danger.progress-striped .bar,.progress-striped .bar-danger{background-color:#ee5f5b;background-image:-webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent));background-image:-webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, t [...]
+.progress-success .bar,.progress .bar-success{background-color:#5eb95e;background-image:-moz-linear-gradient(top, #62c462, #57a957);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#62c462), to(#57a957));background-image:-webkit-linear-gradient(top, #62c462, #57a957);background-image:-o-linear-gradient(top, #62c462, #57a957);background-image:linear-gradient(to bottom, #62c462, #57a957);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr=' [...]
+.progress-success.progress-striped .bar,.progress-striped .bar-success{background-color:#62c462;background-image:-webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent));background-image:-webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, [...]
+.progress-info .bar,.progress .bar-info{background-color:#4bb1cf;background-image:-moz-linear-gradient(top, #5bc0de, #339bb9);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#5bc0de), to(#339bb9));background-image:-webkit-linear-gradient(top, #5bc0de, #339bb9);background-image:-o-linear-gradient(top, #5bc0de, #339bb9);background-image:linear-gradient(to bottom, #5bc0de, #339bb9);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc [...]
+.progress-info.progress-striped .bar,.progress-striped .bar-info{background-color:#5bc0de;background-image:-webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent));background-image:-webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, trans [...]
+.progress-warning .bar,.progress .bar-warning{background-color:#faa732;background-image:-moz-linear-gradient(top, #fbb450, #f89406);background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#fbb450), to(#f89406));background-image:-webkit-linear-gradient(top, #fbb450, #f89406);background-image:-o-linear-gradient(top, #fbb450, #f89406);background-image:linear-gradient(to bottom, #fbb450, #f89406);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr=' [...]
+.progress-warning.progress-striped .bar,.progress-striped .bar-warning{background-color:#fbb450;background-image:-webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent));background-image:-webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, [...]
+.hero-unit{padding:60px;margin-bottom:30px;font-size:18px;font-weight:200;line-height:30px;color:inherit;background-color:#eeeeee;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}.hero-unit h1{margin-bottom:0;font-size:60px;line-height:1;color:inherit;letter-spacing:-1px;}
+.hero-unit li{line-height:30px;}
+.media,.media-body{overflow:hidden;*overflow:visible;zoom:1;}
+.media,.media .media{margin-top:15px;}
+.media:first-child{margin-top:0;}
+.media-object{display:block;}
+.media-heading{margin:0 0 5px;}
+.media .pull-left{margin-right:10px;}
+.media .pull-right{margin-left:10px;}
+.media-list{margin-left:0;list-style:none;}
+.well{min-height:20px;padding:19px;margin-bottom:20px;background-color:#f5f5f5;border:1px solid #e3e3e3;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.05);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.05);box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.05);}.well blockquote{border-color:#ddd;border-color:rgba(0, 0, 0, 0.15);}
+.well-large{padding:24px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}
+.well-small{padding:9px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+.close{float:right;font-size:20px;font-weight:bold;line-height:20px;color:#000000;text-shadow:0 1px 0 #ffffff;opacity:0.2;filter:alpha(opacity=20);}.close:hover{color:#000000;text-decoration:none;cursor:pointer;opacity:0.4;filter:alpha(opacity=40);}
+button.close{padding:0;cursor:pointer;background:transparent;border:0;-webkit-appearance:none;}
+.pull-right{float:right;}
+.pull-left{float:left;}
+.hide{display:none;}
+.show{display:block;}
+.invisible{visibility:hidden;}
+.affix{position:fixed;}
+.fade{opacity:0;-webkit-transition:opacity 0.15s linear;-moz-transition:opacity 0.15s linear;-o-transition:opacity 0.15s linear;transition:opacity 0.15s linear;}.fade.in{opacity:1;}
+.collapse{position:relative;height:0;overflow:hidden;-webkit-transition:height 0.35s ease;-moz-transition:height 0.35s ease;-o-transition:height 0.35s ease;transition:height 0.35s ease;}.collapse.in{height:auto;}
+.hidden{display:none;visibility:hidden;}
+.visible-phone{display:none !important;}
+.visible-tablet{display:none !important;}
+.hidden-desktop{display:none !important;}
+.visible-desktop{display:inherit !important;}
+@media (min-width:768px) and (max-width:979px){.hidden-desktop{display:inherit !important;} .visible-desktop{display:none !important ;} .visible-tablet{display:inherit !important;} .hidden-tablet{display:none !important;}}@media (max-width:767px){.hidden-desktop{display:inherit !important;} .visible-desktop{display:none !important;} .visible-phone{display:inherit !important;} .hidden-phone{display:none !important;}}@media (max-width:767px){body{padding-left:20px;padding-right:20px;} .nav [...]
diff --git a/assets/themes/custom-twitter/bootstrap/img/glyphicons-halflings-white.png b/assets/themes/custom-twitter/bootstrap/img/glyphicons-halflings-white.png
new file mode 100644
index 0000000..3bf6484
Binary files /dev/null and b/assets/themes/custom-twitter/bootstrap/img/glyphicons-halflings-white.png differ
diff --git a/assets/themes/custom-twitter/bootstrap/img/glyphicons-halflings.png b/assets/themes/custom-twitter/bootstrap/img/glyphicons-halflings.png
new file mode 100644
index 0000000..a996999
Binary files /dev/null and b/assets/themes/custom-twitter/bootstrap/img/glyphicons-halflings.png differ
diff --git a/assets/themes/custom-twitter/css/1.4.0/bootstrap.css b/assets/themes/custom-twitter/css/1.4.0/bootstrap.css
new file mode 100644
index 0000000..8ff30dd
--- /dev/null
+++ b/assets/themes/custom-twitter/css/1.4.0/bootstrap.css
@@ -0,0 +1,356 @@
+html,body{margin:0;padding:0;}
+h1,h2,h3,h4,h5,h6,p,blockquote,pre,a,abbr,acronym,address,cite,code,del,dfn,em,img,q,s,samp,small,strike,strong,sub,sup,tt,var,dd,dl,dt,li,ol,ul,fieldset,form,label,legend,button,table,caption,tbody,tfoot,thead,tr,th,td{margin:0;padding:0;border:0;font-weight:normal;font-style:normal;font-size:100%;line-height:1;font-family:inherit;}
+table{border-collapse:collapse;border-spacing:0;}
+ol,ul{list-style:none;}
+q:before,q:after,blockquote:before,blockquote:after{content:"";}
+html{overflow-y:scroll;font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;}
+a:focus{outline:thin dotted;}
+a:hover,a:active{outline:0;}
+article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block;}
+audio,canvas,video{display:inline-block;*display:inline;*zoom:1;}
+audio:not([controls]){display:none;}
+sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline;}
+sup{top:-0.5em;}
+sub{bottom:-0.25em;}
+img{border:0;-ms-interpolation-mode:bicubic;}
+button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;}
+button,input{line-height:normal;*overflow:visible;}
+button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0;}
+button,input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button;}
+input[type="search"]{-webkit-appearance:textfield;-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box;}
+input[type="search"]::-webkit-search-decoration{-webkit-appearance:none;}
+textarea{overflow:auto;vertical-align:top;}
+body{background-color:#ffffff;margin:0;font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-size:13px;font-weight:normal;line-height:18px;color:#404040;}
+.container{width:940px;margin-left:auto;margin-right:auto;zoom:1;}.container:before,.container:after{display:table;content:"";zoom:1;}
+.container:after{clear:both;}
+.container-fluid{position:relative;min-width:940px;padding-left:20px;padding-right:20px;zoom:1;}.container-fluid:before,.container-fluid:after{display:table;content:"";zoom:1;}
+.container-fluid:after{clear:both;}
+.container-fluid>.sidebar{position:absolute;top:0;left:20px;width:220px;}
+.container-fluid>.content{margin-left:240px;}
+a{color:#0069d6;text-decoration:none;line-height:inherit;font-weight:inherit;}a:hover{color:#00438a;text-decoration:underline;}
+.pull-right{float:right;}
+.pull-left{float:left;}
+.hide{display:none;}
+.show{display:block;}
+.row{zoom:1;margin-left:-20px;}.row:before,.row:after{display:table;content:"";zoom:1;}
+.row:after{clear:both;}
+.row>[class*="span"]{display:inline;float:left;margin-left:20px;}
+.span1{width:40px;}
+.span2{width:100px;}
+.span3{width:160px;}
+.span4{width:220px;}
+.span5{width:280px;}
+.span6{width:340px;}
+.span7{width:400px;}
+.span8{width:460px;}
+.span9{width:520px;}
+.span10{width:580px;}
+.span11{width:640px;}
+.span12{width:700px;}
+.span13{width:760px;}
+.span14{width:820px;}
+.span15{width:880px;}
+.span16{width:940px;}
+.span17{width:1000px;}
+.span18{width:1060px;}
+.span19{width:1120px;}
+.span20{width:1180px;}
+.span21{width:1240px;}
+.span22{width:1300px;}
+.span23{width:1360px;}
+.span24{width:1420px;}
+.row>.offset1{margin-left:80px;}
+.row>.offset2{margin-left:140px;}
+.row>.offset3{margin-left:200px;}
+.row>.offset4{margin-left:260px;}
+.row>.offset5{margin-left:320px;}
+.row>.offset6{margin-left:380px;}
+.row>.offset7{margin-left:440px;}
+.row>.offset8{margin-left:500px;}
+.row>.offset9{margin-left:560px;}
+.row>.offset10{margin-left:620px;}
+.row>.offset11{margin-left:680px;}
+.row>.offset12{margin-left:740px;}
+.span-one-third{width:300px;}
+.span-two-thirds{width:620px;}
+.row>.offset-one-third{margin-left:340px;}
+.row>.offset-two-thirds{margin-left:660px;}
+p{font-size:13px;font-weight:normal;line-height:18px;margin-bottom:9px;}p small{font-size:11px;color:#bfbfbf;}
+h1,h2,h3,h4,h5,h6{font-weight:bold;color:#404040;}h1 small,h2 small,h3 small,h4 small,h5 small,h6 small{color:#bfbfbf;}
+h1{margin-bottom:18px;font-size:30px;line-height:36px;}h1 small{font-size:18px;}
+h2{font-size:24px;line-height:36px;}h2 small{font-size:14px;}
+h3,h4,h5,h6{line-height:36px;}
+h3{font-size:18px;}h3 small{font-size:14px;}
+h4{font-size:16px;}h4 small{font-size:12px;}
+h5{font-size:14px;}
+h6{font-size:13px;color:#bfbfbf;text-transform:uppercase;}
+ul,ol{margin:0 0 18px 25px;}
+ul ul,ul ol,ol ol,ol ul{margin-bottom:0;}
+ul{list-style:disc;}
+ol{list-style:decimal;}
+li{line-height:18px;}
+ul.unstyled{list-style:none;margin-left:0;}
+dl{margin-bottom:18px;}dl dt,dl dd{line-height:18px;}
+dl dt{font-weight:bold;}
+dl dd{margin-left:9px;}
+hr{margin:20px 0 19px;border:0;border-bottom:1px solid #eee;}
+strong{font-style:inherit;font-weight:bold;}
+em{font-style:italic;font-weight:inherit;line-height:inherit;}
+.muted{color:#bfbfbf;}
+blockquote{margin-bottom:18px;border-left:5px solid #eee;padding-left:15px;}blockquote p{font-size:14px;font-weight:300;line-height:18px;margin-bottom:0;}
+blockquote small{display:block;font-size:12px;font-weight:300;line-height:18px;color:#bfbfbf;}blockquote small:before{content:'\2014 \00A0';}
+address{display:block;line-height:18px;margin-bottom:18px;}
+code,pre{padding:0 3px 2px;font-family:Monaco, Andale Mono, Courier New, monospace;font-size:12px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+code{color:rgba(0, 0, 0, 0.75);padding:1px 0px;}
+pre{background-color:#f5f5f5;display:block;padding:8.5px;margin:0 0 18px;line-height:18px;font-size:12px;border:1px solid #ccc;border:1px solid rgba(0, 0, 0, 0.15);-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;overflow:auto;}
+form{margin-bottom:18px;}
+fieldset{margin-bottom:18px;padding-top:18px;}fieldset legend{display:block;padding-left:150px;font-size:19.5px;line-height:1;color:#404040;*padding:0 0 5px 145px;*line-height:1.5;}
+form .clearfix{margin-bottom:18px;zoom:1;}form .clearfix:before,form .clearfix:after{display:table;content:"";zoom:1;}
+form .clearfix:after{clear:both;}
+label,input,select,textarea{font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-size:13px;font-weight:normal;line-height:normal;}
+label{padding-top:6px;font-size:13px;line-height:18px;float:left;width:130px;text-align:right;color:#404040;}
+form .input{margin-left:150px;}
+input[type=checkbox],input[type=radio]{cursor:pointer;}
+input,textarea,select,.uneditable-input{display:inline-block;width:210px;height:18px;padding:4px;font-size:13px;line-height:18px;color:#808080;border:1px solid #ccc;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}
+select{padding:initial;}
+input[type=checkbox],input[type=radio]{width:auto;height:auto;padding:0;margin:3px 0;*margin-top:0;line-height:normal;border:none;}
+input[type=file]{background-color:#ffffff;padding:initial;border:initial;line-height:initial;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;}
+input[type=button],input[type=reset],input[type=submit]{width:auto;height:auto;}
+select,input[type=file]{height:27px;*height:auto;line-height:27px;*margin-top:4px;}
+select[multiple]{height:inherit;background-color:#ffffff;}
+textarea{height:auto;}
+.uneditable-input{background-color:#ffffff;display:block;border-color:#eee;-webkit-box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.025);-moz-box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.025);box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.025);cursor:not-allowed;}
+:-moz-placeholder{color:#bfbfbf;}
+::-webkit-input-placeholder{color:#bfbfbf;}
+input,textarea{-webkit-transition:border linear 0.2s,box-shadow linear 0.2s;-moz-transition:border linear 0.2s,box-shadow linear 0.2s;-ms-transition:border linear 0.2s,box-shadow linear 0.2s;-o-transition:border linear 0.2s,box-shadow linear 0.2s;transition:border linear 0.2s,box-shadow linear 0.2s;-webkit-box-shadow:inset 0 1px 3px rgba(0, 0, 0, 0.1);-moz-box-shadow:inset 0 1px 3px rgba(0, 0, 0, 0.1);box-shadow:inset 0 1px 3px rgba(0, 0, 0, 0.1);}
+input:focus,textarea:focus{outline:0;border-color:rgba(82, 168, 236, 0.8);-webkit-box-shadow:inset 0 1px 3px rgba(0, 0, 0, 0.1),0 0 8px rgba(82, 168, 236, 0.6);-moz-box-shadow:inset 0 1px 3px rgba(0, 0, 0, 0.1),0 0 8px rgba(82, 168, 236, 0.6);box-shadow:inset 0 1px 3px rgba(0, 0, 0, 0.1),0 0 8px rgba(82, 168, 236, 0.6);}
+input[type=file]:focus,input[type=checkbox]:focus,select:focus{-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;outline:1px dotted #666;}
+form .clearfix.error>label,form .clearfix.error .help-block,form .clearfix.error .help-inline{color:#b94a48;}
+form .clearfix.error input,form .clearfix.error textarea{color:#b94a48;border-color:#ee5f5b;}form .clearfix.error input:focus,form .clearfix.error textarea:focus{border-color:#e9322d;-webkit-box-shadow:0 0 6px #f8b9b7;-moz-box-shadow:0 0 6px #f8b9b7;box-shadow:0 0 6px #f8b9b7;}
+form .clearfix.error .input-prepend .add-on,form .clearfix.error .input-append .add-on{color:#b94a48;background-color:#fce6e6;border-color:#b94a48;}
+form .clearfix.warning>label,form .clearfix.warning .help-block,form .clearfix.warning .help-inline{color:#c09853;}
+form .clearfix.warning input,form .clearfix.warning textarea{color:#c09853;border-color:#ccae64;}form .clearfix.warning input:focus,form .clearfix.warning textarea:focus{border-color:#be9a3f;-webkit-box-shadow:0 0 6px #e5d6b1;-moz-box-shadow:0 0 6px #e5d6b1;box-shadow:0 0 6px #e5d6b1;}
+form .clearfix.warning .input-prepend .add-on,form .clearfix.warning .input-append .add-on{color:#c09853;background-color:#d2b877;border-color:#c09853;}
+form .clearfix.success>label,form .clearfix.success .help-block,form .clearfix.success .help-inline{color:#468847;}
+form .clearfix.success input,form .clearfix.success textarea{color:#468847;border-color:#57a957;}form .clearfix.success input:focus,form .clearfix.success textarea:focus{border-color:#458845;-webkit-box-shadow:0 0 6px #9acc9a;-moz-box-shadow:0 0 6px #9acc9a;box-shadow:0 0 6px #9acc9a;}
+form .clearfix.success .input-prepend .add-on,form .clearfix.success .input-append .add-on{color:#468847;background-color:#bcddbc;border-color:#468847;}
+.input-mini,input.mini,textarea.mini,select.mini{width:60px;}
+.input-small,input.small,textarea.small,select.small{width:90px;}
+.input-medium,input.medium,textarea.medium,select.medium{width:150px;}
+.input-large,input.large,textarea.large,select.large{width:210px;}
+.input-xlarge,input.xlarge,textarea.xlarge,select.xlarge{width:270px;}
+.input-xxlarge,input.xxlarge,textarea.xxlarge,select.xxlarge{width:530px;}
+textarea.xxlarge{overflow-y:auto;}
+input.span1,textarea.span1{display:inline-block;float:none;width:30px;margin-left:0;}
+input.span2,textarea.span2{display:inline-block;float:none;width:90px;margin-left:0;}
+input.span3,textarea.span3{display:inline-block;float:none;width:150px;margin-left:0;}
+input.span4,textarea.span4{display:inline-block;float:none;width:210px;margin-left:0;}
+input.span5,textarea.span5{display:inline-block;float:none;width:270px;margin-left:0;}
+input.span6,textarea.span6{display:inline-block;float:none;width:330px;margin-left:0;}
+input.span7,textarea.span7{display:inline-block;float:none;width:390px;margin-left:0;}
+input.span8,textarea.span8{display:inline-block;float:none;width:450px;margin-left:0;}
+input.span9,textarea.span9{display:inline-block;float:none;width:510px;margin-left:0;}
+input.span10,textarea.span10{display:inline-block;float:none;width:570px;margin-left:0;}
+input.span11,textarea.span11{display:inline-block;float:none;width:630px;margin-left:0;}
+input.span12,textarea.span12{display:inline-block;float:none;width:690px;margin-left:0;}
+input.span13,textarea.span13{display:inline-block;float:none;width:750px;margin-left:0;}
+input.span14,textarea.span14{display:inline-block;float:none;width:810px;margin-left:0;}
+input.span15,textarea.span15{display:inline-block;float:none;width:870px;margin-left:0;}
+input.span16,textarea.span16{display:inline-block;float:none;width:930px;margin-left:0;}
+input[disabled],select[disabled],textarea[disabled],input[readonly],select[readonly],textarea[readonly]{background-color:#f5f5f5;border-color:#ddd;cursor:not-allowed;}
+.actions{background:#f5f5f5;margin-top:18px;margin-bottom:18px;padding:17px 20px 18px 150px;border-top:1px solid #ddd;-webkit-border-radius:0 0 3px 3px;-moz-border-radius:0 0 3px 3px;border-radius:0 0 3px 3px;}.actions .secondary-action{float:right;}.actions .secondary-action a{line-height:30px;}.actions .secondary-action a:hover{text-decoration:underline;}
+.help-inline,.help-block{font-size:13px;line-height:18px;color:#bfbfbf;}
+.help-inline{padding-left:5px;*position:relative;*top:-5px;}
+.help-block{display:block;max-width:600px;}
+.inline-inputs{color:#808080;}.inline-inputs span{padding:0 2px 0 1px;}
+.input-prepend input,.input-append input{-webkit-border-radius:0 3px 3px 0;-moz-border-radius:0 3px 3px 0;border-radius:0 3px 3px 0;}
+.input-prepend .add-on,.input-append .add-on{position:relative;background:#f5f5f5;border:1px solid #ccc;z-index:2;float:left;display:block;width:auto;min-width:16px;height:18px;padding:4px 4px 4px 5px;margin-right:-1px;font-weight:normal;line-height:18px;color:#bfbfbf;text-align:center;text-shadow:0 1px 0 #ffffff;-webkit-border-radius:3px 0 0 3px;-moz-border-radius:3px 0 0 3px;border-radius:3px 0 0 3px;}
+.input-prepend .active,.input-append .active{background:#a9dba9;border-color:#46a546;}
+.input-prepend .add-on{*margin-top:1px;}
+.input-append input{float:left;-webkit-border-radius:3px 0 0 3px;-moz-border-radius:3px 0 0 3px;border-radius:3px 0 0 3px;}
+.input-append .add-on{-webkit-border-radius:0 3px 3px 0;-moz-border-radius:0 3px 3px 0;border-radius:0 3px 3px 0;margin-right:0;margin-left:-1px;}
+.inputs-list{margin:0 0 5px;width:100%;}.inputs-list li{display:block;padding:0;width:100%;}
+.inputs-list label{display:block;float:none;width:auto;padding:0;margin-left:20px;line-height:18px;text-align:left;white-space:normal;}.inputs-list label strong{color:#808080;}
+.inputs-list label small{font-size:11px;font-weight:normal;}
+.inputs-list .inputs-list{margin-left:25px;margin-bottom:10px;padding-top:0;}
+.inputs-list:first-child{padding-top:6px;}
+.inputs-list li+li{padding-top:2px;}
+.inputs-list input[type=radio],.inputs-list input[type=checkbox]{margin-bottom:0;margin-left:-20px;float:left;}
+.form-stacked{padding-left:20px;}.form-stacked fieldset{padding-top:9px;}
+.form-stacked legend{padding-left:0;}
+.form-stacked label{display:block;float:none;width:auto;font-weight:bold;text-align:left;line-height:20px;padding-top:0;}
+.form-stacked .clearfix{margin-bottom:9px;}.form-stacked .clearfix div.input{margin-left:0;}
+.form-stacked .inputs-list{margin-bottom:0;}.form-stacked .inputs-list li{padding-top:0;}.form-stacked .inputs-list li label{font-weight:normal;padding-top:0;}
+.form-stacked div.clearfix.error{padding-top:10px;padding-bottom:10px;padding-left:10px;margin-top:0;margin-left:-10px;}
+.form-stacked .actions{margin-left:-20px;padding-left:20px;}
+table{width:100%;margin-bottom:18px;padding:0;font-size:13px;border-collapse:collapse;}table th,table td{padding:10px 10px 9px;line-height:18px;text-align:left;}
+table th{padding-top:9px;font-weight:bold;vertical-align:middle;}
+table td{vertical-align:top;border-top:1px solid #ddd;}
+table tbody th{border-top:1px solid #ddd;vertical-align:top;}
+.condensed-table th,.condensed-table td{padding:5px 5px 4px;}
+.bordered-table{border:1px solid #ddd;border-collapse:separate;*border-collapse:collapse;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;}.bordered-table th+th,.bordered-table td+td,.bordered-table th+td{border-left:1px solid #ddd;}
+.bordered-table thead tr:first-child th:first-child,.bordered-table tbody tr:first-child td:first-child{-webkit-border-radius:4px 0 0 0;-moz-border-radius:4px 0 0 0;border-radius:4px 0 0 0;}
+.bordered-table thead tr:first-child th:last-child,.bordered-table tbody tr:first-child td:last-child{-webkit-border-radius:0 4px 0 0;-moz-border-radius:0 4px 0 0;border-radius:0 4px 0 0;}
+.bordered-table tbody tr:last-child td:first-child{-webkit-border-radius:0 0 0 4px;-moz-border-radius:0 0 0 4px;border-radius:0 0 0 4px;}
+.bordered-table tbody tr:last-child td:last-child{-webkit-border-radius:0 0 4px 0;-moz-border-radius:0 0 4px 0;border-radius:0 0 4px 0;}
+table .span1{width:20px;}
+table .span2{width:60px;}
+table .span3{width:100px;}
+table .span4{width:140px;}
+table .span5{width:180px;}
+table .span6{width:220px;}
+table .span7{width:260px;}
+table .span8{width:300px;}
+table .span9{width:340px;}
+table .span10{width:380px;}
+table .span11{width:420px;}
+table .span12{width:460px;}
+table .span13{width:500px;}
+table .span14{width:540px;}
+table .span15{width:580px;}
+table .span16{width:620px;}
+.zebra-striped tbody tr:nth-child(odd) td,.zebra-striped tbody tr:nth-child(odd) th{background-color:#f9f9f9;}
+.zebra-striped tbody tr:hover td,.zebra-striped tbody tr:hover th{background-color:#f5f5f5;}
+table .header{cursor:pointer;}table .header:after{content:"";float:right;margin-top:7px;border-width:0 4px 4px;border-style:solid;border-color:#000 transparent;visibility:hidden;}
+table .headerSortUp,table .headerSortDown{background-color:rgba(141, 192, 219, 0.25);text-shadow:0 1px 1px rgba(255, 255, 255, 0.75);}
+table .header:hover:after{visibility:visible;}
+table .headerSortDown:after,table .headerSortDown:hover:after{visibility:visible;filter:alpha(opacity=60);-khtml-opacity:0.6;-moz-opacity:0.6;opacity:0.6;}
+table .headerSortUp:after{border-bottom:none;border-left:4px solid transparent;border-right:4px solid transparent;border-top:4px solid #000;visibility:visible;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;filter:alpha(opacity=60);-khtml-opacity:0.6;-moz-opacity:0.6;opacity:0.6;}
+table .blue{color:#049cdb;border-bottom-color:#049cdb;}
+table .headerSortUp.blue,table .headerSortDown.blue{background-color:#ade6fe;}
+table .green{color:#46a546;border-bottom-color:#46a546;}
+table .headerSortUp.green,table .headerSortDown.green{background-color:#cdeacd;}
+table .red{color:#9d261d;border-bottom-color:#9d261d;}
+table .headerSortUp.red,table .headerSortDown.red{background-color:#f4c8c5;}
+table .yellow{color:#ffc40d;border-bottom-color:#ffc40d;}
+table .headerSortUp.yellow,table .headerSortDown.yellow{background-color:#fff6d9;}
+table .orange{color:#f89406;border-bottom-color:#f89406;}
+table .headerSortUp.orange,table .headerSortDown.orange{background-color:#fee9cc;}
+table .purple{color:#7a43b6;border-bottom-color:#7a43b6;}
+table .headerSortUp.purple,table .headerSortDown.purple{background-color:#e2d5f0;}
+.topbar{height:40px;position:fixed;top:0;left:0;right:0;z-index:10000;overflow:visible;}.topbar a{color:#bfbfbf;text-shadow:0 -1px 0 rgba(0, 0, 0, 0.25);}
+.topbar h3 a:hover,.topbar .brand:hover,.topbar ul .active>a{background-color:#333;background-color:rgba(255, 255, 255, 0.05);color:#ffffff;text-decoration:none;}
+.topbar h3{position:relative;}
+.topbar h3 a,.topbar .brand{float:left;display:block;padding:8px 20px 12px;margin-left:-20px;color:#ffffff;font-size:20px;font-weight:200;line-height:1;}
+.topbar p{margin:0;line-height:40px;}.topbar p a:hover{background-color:transparent;color:#ffffff;}
+.topbar form{float:left;margin:5px 0 0 0;position:relative;filter:alpha(opacity=100);-khtml-opacity:1;-moz-opacity:1;opacity:1;}
+.topbar form.pull-right{float:right;}
+.topbar input{background-color:#444;background-color:rgba(255, 255, 255, 0.3);font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-size:13px;font-weight:normal;line-height:1;padding:4px 9px;color:#ffffff;color:rgba(255, 255, 255, 0.75);border:1px solid #111;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.1),0 1px 0px rgba(255, 255, 255, 0.25);-moz-box-shadow:inset 0 1px 2px rgba(0, 0, 0, 0.1),0 1px 0px rgba(255 [...]
+.topbar input::-webkit-input-placeholder{color:#e6e6e6;}
+.topbar input:hover{background-color:#bfbfbf;background-color:rgba(255, 255, 255, 0.5);color:#ffffff;}
+.topbar input:focus,.topbar input.focused{outline:0;background-color:#ffffff;color:#404040;text-shadow:0 1px 0 #ffffff;border:0;padding:5px 10px;-webkit-box-shadow:0 0 3px rgba(0, 0, 0, 0.15);-moz-box-shadow:0 0 3px rgba(0, 0, 0, 0.15);box-shadow:0 0 3px rgba(0, 0, 0, 0.15);}
+.topbar-inner,.topbar .fill{background-color:#222;background-color:#222222;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#333333), to(#222222));background-image:-moz-linear-gradient(top, #333333, #222222);background-image:-ms-linear-gradient(top, #333333, #222222);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #333333), color-stop(100%, #222222));background-image:-webkit-linear-gradient(top, #333333, #2222 [...]
+.topbar div>ul,.nav{display:block;float:left;margin:0 10px 0 0;position:relative;left:0;}.topbar div>ul>li,.nav>li{display:block;float:left;}
+.topbar div>ul a,.nav a{display:block;float:none;padding:10px 10px 11px;line-height:19px;text-decoration:none;}.topbar div>ul a:hover,.nav a:hover{color:#ffffff;text-decoration:none;}
+.topbar div>ul .active>a,.nav .active>a{background-color:#222;background-color:rgba(0, 0, 0, 0.5);}
+.topbar div>ul.secondary-nav,.nav.secondary-nav{float:right;margin-left:10px;margin-right:0;}.topbar div>ul.secondary-nav .menu-dropdown,.nav.secondary-nav .menu-dropdown,.topbar div>ul.secondary-nav .dropdown-menu,.nav.secondary-nav .dropdown-menu{right:0;border:0;}
+.topbar div>ul a.menu:hover,.nav a.menu:hover,.topbar div>ul li.open .menu,.nav li.open .menu,.topbar div>ul .dropdown-toggle:hover,.nav .dropdown-toggle:hover,.topbar div>ul .dropdown.open .dropdown-toggle,.nav .dropdown.open .dropdown-toggle{background:#444;background:rgba(255, 255, 255, 0.05);}
+.topbar div>ul .menu-dropdown,.nav .menu-dropdown,.topbar div>ul .dropdown-menu,.nav .dropdown-menu{background-color:#333;}.topbar div>ul .menu-dropdown a.menu,.nav .menu-dropdown a.menu,.topbar div>ul .dropdown-menu a.menu,.nav .dropdown-menu a.menu,.topbar div>ul .menu-dropdown .dropdown-toggle,.nav .menu-dropdown .dropdown-toggle,.topbar div>ul .dropdown-menu .dropdown-toggle,.nav .dropdown-menu .dropdown-toggle{color:#ffffff;}.topbar div>ul .menu-dropdown a.menu.open,.nav .menu-dropd [...]
+.topbar div>ul .menu-dropdown li a,.nav .menu-dropdown li a,.topbar div>ul .dropdown-menu li a,.nav .dropdown-menu li a{color:#999;text-shadow:0 1px 0 rgba(0, 0, 0, 0.5);}.topbar div>ul .menu-dropdown li a:hover,.nav .menu-dropdown li a:hover,.topbar div>ul .dropdown-menu li a:hover,.nav .dropdown-menu li a:hover{background-color:#191919;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#292929), to(#191919));background-image:-moz-linear-grad [...]
+.topbar div>ul .menu-dropdown .active a,.nav .menu-dropdown .active a,.topbar div>ul .dropdown-menu .active a,.nav .dropdown-menu .active a{color:#ffffff;}
+.topbar div>ul .menu-dropdown .divider,.nav .menu-dropdown .divider,.topbar div>ul .dropdown-menu .divider,.nav .dropdown-menu .divider{background-color:#222;border-color:#444;}
+.topbar ul .menu-dropdown li a,.topbar ul .dropdown-menu li a{padding:4px 15px;}
+li.menu,.dropdown{position:relative;}
+a.menu:after,.dropdown-toggle:after{width:0;height:0;display:inline-block;content:"&darr;";text-indent:-99999px;vertical-align:top;margin-top:8px;margin-left:4px;border-left:4px solid transparent;border-right:4px solid transparent;border-top:4px solid #ffffff;filter:alpha(opacity=50);-khtml-opacity:0.5;-moz-opacity:0.5;opacity:0.5;}
+.menu-dropdown,.dropdown-menu{background-color:#ffffff;float:left;display:none;position:absolute;top:40px;z-index:900;min-width:160px;max-width:220px;_width:160px;margin-left:0;margin-right:0;padding:6px 0;zoom:1;border-color:#999;border-color:rgba(0, 0, 0, 0.2);border-style:solid;border-width:0 1px 1px;-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 6px;-webkit-box-shadow:0 2px 4px rgba(0, 0, 0, 0.2);-moz-box-shadow:0 2px 4px rgba(0, 0, 0, 0.2);box [...]
+.menu-dropdown .divider,.dropdown-menu .divider{height:1px;margin:5px 0;overflow:hidden;background-color:#eee;border-bottom:1px solid #ffffff;}
+.topbar .dropdown-menu a,.dropdown-menu a{display:block;padding:4px 15px;clear:both;font-weight:normal;line-height:18px;color:#808080;text-shadow:0 1px 0 #ffffff;}.topbar .dropdown-menu a:hover,.dropdown-menu a:hover,.topbar .dropdown-menu a.hover,.dropdown-menu a.hover{background-color:#dddddd;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#eeeeee), to(#dddddd));background-image:-moz-linear-gradient(top, #eeeeee, #dddddd);background-image [...]
+.open .menu,.dropdown.open .menu,.open .dropdown-toggle,.dropdown.open .dropdown-toggle{color:#ffffff;background:#ccc;background:rgba(0, 0, 0, 0.3);}
+.open .menu-dropdown,.dropdown.open .menu-dropdown,.open .dropdown-menu,.dropdown.open .dropdown-menu{display:block;}
+.tabs,.pills{margin:0 0 18px;padding:0;list-style:none;zoom:1;}.tabs:before,.pills:before,.tabs:after,.pills:after{display:table;content:"";zoom:1;}
+.tabs:after,.pills:after{clear:both;}
+.tabs>li,.pills>li{float:left;}.tabs>li>a,.pills>li>a{display:block;}
+.tabs{border-color:#ddd;border-style:solid;border-width:0 0 1px;}.tabs>li{position:relative;margin-bottom:-1px;}.tabs>li>a{padding:0 15px;margin-right:2px;line-height:34px;border:1px solid transparent;-webkit-border-radius:4px 4px 0 0;-moz-border-radius:4px 4px 0 0;border-radius:4px 4px 0 0;}.tabs>li>a:hover{text-decoration:none;background-color:#eee;border-color:#eee #eee #ddd;}
+.tabs .active>a,.tabs .active>a:hover{color:#808080;background-color:#ffffff;border:1px solid #ddd;border-bottom-color:transparent;cursor:default;}
+.tabs .menu-dropdown,.tabs .dropdown-menu{top:35px;border-width:1px;-webkit-border-radius:0 6px 6px 6px;-moz-border-radius:0 6px 6px 6px;border-radius:0 6px 6px 6px;}
+.tabs a.menu:after,.tabs .dropdown-toggle:after{border-top-color:#999;margin-top:15px;margin-left:5px;}
+.tabs li.open.menu .menu,.tabs .open.dropdown .dropdown-toggle{border-color:#999;}
+.tabs li.open a.menu:after,.tabs .dropdown.open .dropdown-toggle:after{border-top-color:#555;}
+.pills a{margin:5px 3px 5px 0;padding:0 15px;line-height:30px;text-shadow:0 1px 1px #ffffff;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px;}.pills a:hover{color:#ffffff;text-decoration:none;text-shadow:0 1px 1px rgba(0, 0, 0, 0.25);background-color:#00438a;}
+.pills .active a{color:#ffffff;text-shadow:0 1px 1px rgba(0, 0, 0, 0.25);background-color:#0069d6;}
+.pills-vertical>li{float:none;}
+.tab-content>.tab-pane,.pill-content>.pill-pane,.tab-content>div,.pill-content>div{display:none;}
+.tab-content>.active,.pill-content>.active{display:block;}
+.breadcrumb{padding:7px 14px;margin:0 0 18px;background-color:#f5f5f5;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#ffffff), to(#f5f5f5));background-image:-moz-linear-gradient(top, #ffffff, #f5f5f5);background-image:-ms-linear-gradient(top, #ffffff, #f5f5f5);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #ffffff), color-stop(100%, #f5f5f5));background-image:-webkit-linear-gradient(top, #ffffff, #f5f5f5);b [...]
+.breadcrumb .divider{padding:0 5px;color:#bfbfbf;}
+.breadcrumb .active a{color:#404040;}
+.hero-unit{background-color:#f5f5f5;margin-bottom:30px;padding:60px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}.hero-unit h1{margin-bottom:0;font-size:60px;line-height:1;letter-spacing:-1px;}
+.hero-unit p{font-size:18px;font-weight:200;line-height:27px;}
+footer{margin-top:17px;padding-top:17px;border-top:1px solid #eee;}
+.page-header{margin-bottom:17px;border-bottom:1px solid #ddd;-webkit-box-shadow:0 1px 0 rgba(255, 255, 255, 0.5);-moz-box-shadow:0 1px 0 rgba(255, 255, 255, 0.5);box-shadow:0 1px 0 rgba(255, 255, 255, 0.5);}.page-header h1{margin-bottom:8px;}
+.btn.danger,.alert-message.danger,.btn.danger:hover,.alert-message.danger:hover,.btn.error,.alert-message.error,.btn.error:hover,.alert-message.error:hover,.btn.success,.alert-message.success,.btn.success:hover,.alert-message.success:hover,.btn.info,.alert-message.info,.btn.info:hover,.alert-message.info:hover{color:#ffffff;}
+.btn .close,.alert-message .close{font-family:Arial,sans-serif;line-height:18px;}
+.btn.danger,.alert-message.danger,.btn.error,.alert-message.error{background-color:#c43c35;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#ee5f5b), to(#c43c35));background-image:-moz-linear-gradient(top, #ee5f5b, #c43c35);background-image:-ms-linear-gradient(top, #ee5f5b, #c43c35);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #ee5f5b), color-stop(100%, #c43c35));background-image:-webkit-linear-gradient(top [...]
+.btn.success,.alert-message.success{background-color:#57a957;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#62c462), to(#57a957));background-image:-moz-linear-gradient(top, #62c462, #57a957);background-image:-ms-linear-gradient(top, #62c462, #57a957);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #62c462), color-stop(100%, #57a957));background-image:-webkit-linear-gradient(top, #62c462, #57a957);background [...]
+.btn.info,.alert-message.info{background-color:#339bb9;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#5bc0de), to(#339bb9));background-image:-moz-linear-gradient(top, #5bc0de, #339bb9);background-image:-ms-linear-gradient(top, #5bc0de, #339bb9);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #5bc0de), color-stop(100%, #339bb9));background-image:-webkit-linear-gradient(top, #5bc0de, #339bb9);background-image [...]
+.btn{cursor:pointer;display:inline-block;background-color:#e6e6e6;background-repeat:no-repeat;background-image:-webkit-gradient(linear, 0 0, 0 100%, from(#ffffff), color-stop(25%, #ffffff), to(#e6e6e6));background-image:-webkit-linear-gradient(#ffffff, #ffffff 25%, #e6e6e6);background-image:-moz-linear-gradient(top, #ffffff, #ffffff 25%, #e6e6e6);background-image:-ms-linear-gradient(#ffffff, #ffffff 25%, #e6e6e6);background-image:-o-linear-gradient(#ffffff, #ffffff 25%, #e6e6e6);backgrou [...]
+.btn:focus{outline:1px dotted #666;}
+.btn.primary{color:#ffffff;background-color:#0064cd;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#049cdb), to(#0064cd));background-image:-moz-linear-gradient(top, #049cdb, #0064cd);background-image:-ms-linear-gradient(top, #049cdb, #0064cd);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #049cdb), color-stop(100%, #0064cd));background-image:-webkit-linear-gradient(top, #049cdb, #0064cd);background-image:-o [...]
+.btn.active,.btn:active{-webkit-box-shadow:inset 0 2px 4px rgba(0, 0, 0, 0.25),0 1px 2px rgba(0, 0, 0, 0.05);-moz-box-shadow:inset 0 2px 4px rgba(0, 0, 0, 0.25),0 1px 2px rgba(0, 0, 0, 0.05);box-shadow:inset 0 2px 4px rgba(0, 0, 0, 0.25),0 1px 2px rgba(0, 0, 0, 0.05);}
+.btn.disabled{cursor:default;background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=65);-khtml-opacity:0.65;-moz-opacity:0.65;opacity:0.65;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;}
+.btn[disabled]{cursor:default;background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=65);-khtml-opacity:0.65;-moz-opacity:0.65;opacity:0.65;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;}
+.btn.large{font-size:15px;line-height:normal;padding:9px 14px 9px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;}
+.btn.small{padding:7px 9px 7px;font-size:11px;}
+:root .alert-message,:root .btn{border-radius:0 \0;}
+button.btn::-moz-focus-inner,input[type=submit].btn::-moz-focus-inner{padding:0;border:0;}
+.close{float:right;color:#000000;font-size:20px;font-weight:bold;line-height:13.5px;text-shadow:0 1px 0 #ffffff;filter:alpha(opacity=25);-khtml-opacity:0.25;-moz-opacity:0.25;opacity:0.25;}.close:hover{color:#000000;text-decoration:none;filter:alpha(opacity=40);-khtml-opacity:0.4;-moz-opacity:0.4;opacity:0.4;}
+.alert-message{position:relative;padding:7px 15px;margin-bottom:18px;color:#404040;background-color:#eedc94;background-repeat:repeat-x;background-image:-khtml-gradient(linear, left top, left bottom, from(#fceec1), to(#eedc94));background-image:-moz-linear-gradient(top, #fceec1, #eedc94);background-image:-ms-linear-gradient(top, #fceec1, #eedc94);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #fceec1), color-stop(100%, #eedc94));background-image:-webkit-li [...]
+.alert-message a{font-weight:bold;color:#404040;}
+.alert-message.danger p a,.alert-message.error p a,.alert-message.success p a,.alert-message.info p a{color:#ffffff;}
+.alert-message h5{line-height:18px;}
+.alert-message p{margin-bottom:0;}
+.alert-message div{margin-top:5px;margin-bottom:2px;line-height:28px;}
+.alert-message .btn{-webkit-box-shadow:0 1px 0 rgba(255, 255, 255, 0.25);-moz-box-shadow:0 1px 0 rgba(255, 255, 255, 0.25);box-shadow:0 1px 0 rgba(255, 255, 255, 0.25);}
+.alert-message.block-message{background-image:none;background-color:#fdf5d9;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);padding:14px;border-color:#fceec1;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none;}.alert-message.block-message ul,.alert-message.block-message p{margin-right:30px;}
+.alert-message.block-message ul{margin-bottom:0;}
+.alert-message.block-message li{color:#404040;}
+.alert-message.block-message .alert-actions{margin-top:5px;}
+.alert-message.block-message.error,.alert-message.block-message.success,.alert-message.block-message.info{color:#404040;text-shadow:0 1px 0 rgba(255, 255, 255, 0.5);}
+.alert-message.block-message.error{background-color:#fddfde;border-color:#fbc7c6;}
+.alert-message.block-message.success{background-color:#d1eed1;border-color:#bfe7bf;}
+.alert-message.block-message.info{background-color:#ddf4fb;border-color:#c6edf9;}
+.alert-message.block-message.danger p a,.alert-message.block-message.error p a,.alert-message.block-message.success p a,.alert-message.block-message.info p a{color:#404040;}
+.pagination{height:36px;margin:18px 0;}.pagination ul{float:left;margin:0;border:1px solid #ddd;border:1px solid rgba(0, 0, 0, 0.15);-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;-webkit-box-shadow:0 1px 2px rgba(0, 0, 0, 0.05);-moz-box-shadow:0 1px 2px rgba(0, 0, 0, 0.05);box-shadow:0 1px 2px rgba(0, 0, 0, 0.05);}
+.pagination li{display:inline;}
+.pagination a{float:left;padding:0 14px;line-height:34px;border-right:1px solid;border-right-color:#ddd;border-right-color:rgba(0, 0, 0, 0.15);*border-right-color:#ddd;text-decoration:none;}
+.pagination a:hover,.pagination .active a{background-color:#c7eefe;}
+.pagination .disabled a,.pagination .disabled a:hover{background-color:transparent;color:#bfbfbf;}
+.pagination .next a{border:0;}
+.well{background-color:#f5f5f5;margin-bottom:20px;padding:19px;min-height:20px;border:1px solid #eee;border:1px solid rgba(0, 0, 0, 0.05);-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.05);-moz-box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.05);box-shadow:inset 0 1px 1px rgba(0, 0, 0, 0.05);}.well blockquote{border-color:#ddd;border-color:rgba(0, 0, 0, 0.15);}
+.modal-backdrop{background-color:#000000;position:fixed;top:0;left:0;right:0;bottom:0;z-index:10000;}.modal-backdrop.fade{opacity:0;}
+.modal-backdrop,.modal-backdrop.fade.in{filter:alpha(opacity=80);-khtml-opacity:0.8;-moz-opacity:0.8;opacity:0.8;}
+.modal{position:fixed;top:50%;left:50%;z-index:11000;width:560px;margin:-250px 0 0 -280px;background-color:#ffffff;border:1px solid #999;border:1px solid rgba(0, 0, 0, 0.3);*border:1px solid #999;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;-webkit-box-shadow:0 3px 7px rgba(0, 0, 0, 0.3);-moz-box-shadow:0 3px 7px rgba(0, 0, 0, 0.3);box-shadow:0 3px 7px rgba(0, 0, 0, 0.3);-webkit-background-clip:padding-box;-moz-background-clip:padding-box;background-clip:padding-box [...]
+.modal.fade{-webkit-transition:opacity .3s linear, top .3s ease-out;-moz-transition:opacity .3s linear, top .3s ease-out;-ms-transition:opacity .3s linear, top .3s ease-out;-o-transition:opacity .3s linear, top .3s ease-out;transition:opacity .3s linear, top .3s ease-out;top:-25%;}
+.modal.fade.in{top:50%;}
+.modal-header{border-bottom:1px solid #eee;padding:5px 15px;}
+.modal-body{padding:15px;}
+.modal-body form{margin-bottom:0;}
+.modal-footer{background-color:#f5f5f5;padding:14px 15px 15px;border-top:1px solid #ddd;-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 6px;-webkit-box-shadow:inset 0 1px 0 #ffffff;-moz-box-shadow:inset 0 1px 0 #ffffff;box-shadow:inset 0 1px 0 #ffffff;zoom:1;margin-bottom:0;}.modal-footer:before,.modal-footer:after{display:table;content:"";zoom:1;}
+.modal-footer:after{clear:both;}
+.modal-footer .btn{float:right;margin-left:5px;}
+.modal .popover,.modal .twipsy{z-index:12000;}
+.twipsy{display:block;position:absolute;visibility:visible;padding:5px;font-size:11px;z-index:1000;filter:alpha(opacity=80);-khtml-opacity:0.8;-moz-opacity:0.8;opacity:0.8;}.twipsy.fade.in{filter:alpha(opacity=80);-khtml-opacity:0.8;-moz-opacity:0.8;opacity:0.8;}
+.twipsy.above .twipsy-arrow{bottom:0;left:50%;margin-left:-5px;border-left:5px solid transparent;border-right:5px solid transparent;border-top:5px solid #000000;}
+.twipsy.left .twipsy-arrow{top:50%;right:0;margin-top:-5px;border-top:5px solid transparent;border-bottom:5px solid transparent;border-left:5px solid #000000;}
+.twipsy.below .twipsy-arrow{top:0;left:50%;margin-left:-5px;border-left:5px solid transparent;border-right:5px solid transparent;border-bottom:5px solid #000000;}
+.twipsy.right .twipsy-arrow{top:50%;left:0;margin-top:-5px;border-top:5px solid transparent;border-bottom:5px solid transparent;border-right:5px solid #000000;}
+.twipsy-inner{padding:3px 8px;background-color:#000000;color:white;text-align:center;max-width:200px;text-decoration:none;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;}
+.twipsy-arrow{position:absolute;width:0;height:0;}
+.popover{position:absolute;top:0;left:0;z-index:1000;padding:5px;display:none;}.popover.above .arrow{bottom:0;left:50%;margin-left:-5px;border-left:5px solid transparent;border-right:5px solid transparent;border-top:5px solid #000000;}
+.popover.right .arrow{top:50%;left:0;margin-top:-5px;border-top:5px solid transparent;border-bottom:5px solid transparent;border-right:5px solid #000000;}
+.popover.below .arrow{top:0;left:50%;margin-left:-5px;border-left:5px solid transparent;border-right:5px solid transparent;border-bottom:5px solid #000000;}
+.popover.left .arrow{top:50%;right:0;margin-top:-5px;border-top:5px solid transparent;border-bottom:5px solid transparent;border-left:5px solid #000000;}
+.popover .arrow{position:absolute;width:0;height:0;}
+.popover .inner{background:#000000;background:rgba(0, 0, 0, 0.8);padding:3px;overflow:hidden;width:280px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;-webkit-box-shadow:0 3px 7px rgba(0, 0, 0, 0.3);-moz-box-shadow:0 3px 7px rgba(0, 0, 0, 0.3);box-shadow:0 3px 7px rgba(0, 0, 0, 0.3);}
+.popover .title{background-color:#f5f5f5;padding:9px 15px;line-height:1;-webkit-border-radius:3px 3px 0 0;-moz-border-radius:3px 3px 0 0;border-radius:3px 3px 0 0;border-bottom:1px solid #eee;}
+.popover .content{background-color:#ffffff;padding:14px;-webkit-border-radius:0 0 3px 3px;-moz-border-radius:0 0 3px 3px;border-radius:0 0 3px 3px;-webkit-background-clip:padding-box;-moz-background-clip:padding-box;background-clip:padding-box;}.popover .content p,.popover .content ul,.popover .content ol{margin-bottom:0;}
+.fade{-webkit-transition:opacity 0.15s linear;-moz-transition:opacity 0.15s linear;-ms-transition:opacity 0.15s linear;-o-transition:opacity 0.15s linear;transition:opacity 0.15s linear;opacity:0;}.fade.in{opacity:1;}
+.label{padding:1px 3px 2px;font-size:9.75px;font-weight:bold;color:#ffffff;text-transform:uppercase;white-space:nowrap;background-color:#bfbfbf;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;}.label.important{background-color:#c43c35;}
+.label.warning{background-color:#f89406;}
+.label.success{background-color:#46a546;}
+.label.notice{background-color:#62cffc;}
+.media-grid{margin-left:-20px;margin-bottom:0;zoom:1;}.media-grid:before,.media-grid:after{display:table;content:"";zoom:1;}
+.media-grid:after{clear:both;}
+.media-grid li{display:inline;}
+.media-grid a{float:left;padding:4px;margin:0 0 18px 20px;border:1px solid #ddd;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:0 1px 1px rgba(0, 0, 0, 0.075);-moz-box-shadow:0 1px 1px rgba(0, 0, 0, 0.075);box-shadow:0 1px 1px rgba(0, 0, 0, 0.075);}.media-grid a img{display:block;}
+.media-grid a:hover{border-color:#0069d6;-webkit-box-shadow:0 1px 4px rgba(0, 105, 214, 0.25);-moz-box-shadow:0 1px 4px rgba(0, 105, 214, 0.25);box-shadow:0 1px 4px rgba(0, 105, 214, 0.25);}
\ No newline at end of file
diff --git a/assets/themes/custom-twitter/css/style.css b/assets/themes/custom-twitter/css/style.css
new file mode 100644
index 0000000..3dadbd9
--- /dev/null
+++ b/assets/themes/custom-twitter/css/style.css
@@ -0,0 +1,417 @@
+/* Override some defaults */
+html, body {
+
+}
+body {
+  padding-top: 60px; /* 60px to make the container go all the way to the bottom of the topbar */
+}
+.container > footer p {
+  text-align: center; /* center align it with the container */
+}
+
+.container {
+  width: 970px;
+}
+
+#tvm-banner-container {
+  width: 100%;
+  height: 600px;
+  margin-top: -4px;
+  background-repeat: no-repeat, no-repeat, no-repeat;
+  background-position: 4% 60%,  96% 60%, center;
+  background-size: 22%, 20%, 100%;
+  background-image:
+     url("/images/logo/tvm-banner-left-objs-white.svg"),
+     url("/images/logo/tvm-banner-right-objs-white.svg"),
+     linear-gradient(175deg, #46A0F8, #333333);
+}
+
+#tvm-banner-title  {
+  margin: auto;
+  padding: 10% 0px;
+  width: 70%;
+}
+
+#tvm-banner-title p {
+  font-family: -apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";
+  font-size: 43px;
+  font-weight: 200;
+  color: white;
+  line-height: 50px;
+  text-align: center;
+}
+
+#tvm-banner-subtitle p {
+  font-size: 24px;
+}
+
+#banner-btn {
+  border: 2px solid white;
+  font-size: 20px;
+  margin-top: 50px;
+  padding: 0px 34px;
+  color: white;
+  font-weight: 400;
+  background-color: transparent;
+}
+
+#banner-btn:hover {
+  background-color: white;
+  color: #3e658d;
+  font-weight: 400;
+}
+.span14 {
+  width: 970px;
+}
+
+
+/* The white background content wrapper */
+.content {
+  font-family: -apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol";
+  background-color: #fff;
+  padding: 20px;
+  margin: 0 -20px; /* negative indent the amount of the padding to maintain the grid system */
+  -webkit-border-radius: 0 0 6px 6px;
+     -moz-border-radius: 0 0 6px 6px;
+          border-radius: 0 0 6px 6px;
+  /*-webkit-box-shadow: 0 1px 2px rgba(0,0,0,.15);
+     -moz-box-shadow: 0 1px 2px rgba(0,0,0,.15);
+          box-shadow: 0 1px 2px rgba(0,0,0,.15);*/
+}
+
+.link-btn {
+    background-color: white;
+    border: 2px solid #555555;
+    border-radius: 8px;
+    color: white;
+    padding: 8px 12px;
+    text-align: center;
+    text-decoration: none;
+    display: inline-block;
+    font-size: 16px;
+    margin: 3px 1px;
+    -webkit-transition-duration: 0.1s; /* Safari */
+    transition-duration: 0.1s;
+    cursor: pointer;
+    color: black;
+}
+
+.link-btn:hover {
+    background-color: #555555;
+    color: white;
+    text-decoration: none;
+}
+
+/* Page header tweaks */
+.page-header {
+  background-color: #f5f5f5;
+  padding: 20px 20px 10px;
+  margin: -20px -20px 20px;
+}
+
+.content p {
+   font-size: 16px;
+   line-height: 24px;
+}
+
+.content ul {
+   margin-top:8px;
+   font-size: 16px;
+   line-height: 24px;
+}
+
+.content li {
+   margin-bottom: 7px;
+}
+
+.content h1 {
+   font-size:28px;
+}
+
+/* navigation, topbar */
+
+#logo-wrap {
+   float: left;
+   padding-top: 5px;
+   margin-top: 8px;
+   margin-bottom: 8px;
+}
+
+#nav-bar {
+  margin-left: 20px;
+  font-size: 16px;
+  font-family: "Helvetica Neue",Helvetica,Arial,sans-serif;
+  height: 50px;
+}
+
+#nav-bar li {
+  margin-top: 10px;
+  margin-right:15px;
+}
+
+/* blog */
+.bloglist ul {
+  margin-top: 10px;
+}
+
+.bloglist li {
+  margin-top: 4px;
+  margin-bottom: 25px;
+}
+
+.topbar .btn {
+  border: 0px;
+}
+
+/* active color use default */
+.topbar div>ul .active>a, .nav .active>a {
+   background-color: rgb(42, 42, 42);
+}
+
+/* tag_box ======================================================== */
+
+.community_logo img {
+  margin: 10px;
+}
+
+.tag_box {
+	list-style:none;
+	margin:0;
+	padding:5px 0 ;
+	overflow:hidden;
+}
+.tag_box li {
+	line-height:28px;
+}
+.tag_box.inline li {
+	float:left;
+}
+.tag_box a {
+	padding: 3px 6px;
+	margin: 2px;
+	background: #eee;
+	color:#005F6B;
+	border-radius: 3px;
+	text-decoration:none;
+}
+.tag_box a span{
+	vertical-align:super;
+	font-size:0.8em;
+}
+
+.tag_box a.active {
+	background:#57A957;
+	border:1px solid #4C964D;
+	color:#FFF;
+}
+
+
+.highlight table td { padding: 5px; }
+.highlight table pre { margin: 0; }
+.highlight .cm {
+  color: #999988;
+  font-style: italic;
+}
+.highlight .cp {
+  color: #999999;
+  font-weight: bold;
+}
+.highlight .c1 {
+  color: #999988;
+  font-style: italic;
+}
+.highlight .cs {
+  color: #999999;
+  font-weight: bold;
+  font-style: italic;
+}
+.highlight .c, .highlight .cd {
+  color: #999988;
+  font-style: italic;
+}
+.highlight .err {
+  color: #a61717;
+  background-color: #e3d2d2;
+}
+.highlight .gd {
+  color: #000000;
+  background-color: #ffdddd;
+}
+.highlight .ge {
+  color: #000000;
+  font-style: italic;
+}
+.highlight .gr {
+  color: #aa0000;
+}
+.highlight .gh {
+  color: #999999;
+}
+.highlight .gi {
+  color: #000000;
+  background-color: #ddffdd;
+}
+.highlight .go {
+  color: #888888;
+}
+.highlight .gp {
+  color: #555555;
+}
+.highlight .gs {
+  font-weight: bold;
+}
+.highlight .gu {
+  color: #aaaaaa;
+}
+.highlight .gt {
+  color: #aa0000;
+}
+.highlight .kc {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .kd {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .kn {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .kp {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .kr {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .kt {
+  color: #445588;
+  font-weight: bold;
+}
+.highlight .k, .highlight .kv {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .mf {
+  color: #009999;
+}
+.highlight .mh {
+  color: #009999;
+}
+.highlight .il {
+  color: #009999;
+}
+.highlight .mi {
+  color: #009999;
+}
+.highlight .mo {
+  color: #009999;
+}
+.highlight .m, .highlight .mb, .highlight .mx {
+  color: #009999;
+}
+.highlight .sb {
+  color: #d14;
+}
+.highlight .sc {
+  color: #d14;
+}
+.highlight .sd {
+  color: #d14;
+}
+.highlight .s2 {
+  color: #d14;
+}
+.highlight .se {
+  color: #d14;
+}
+.highlight .sh {
+  color: #d14;
+}
+.highlight .si {
+  color: #d14;
+}
+.highlight .sx {
+  color: #d14;
+}
+.highlight .sr {
+  color: #009926;
+}
+.highlight .s1 {
+  color: #d14;
+}
+.highlight .ss {
+  color: #990073;
+}
+.highlight .s {
+  color: #d14;
+}
+.highlight .na {
+  color: #008080;
+}
+.highlight .bp {
+  color: #999999;
+}
+.highlight .nb {
+  color: #0086B3;
+}
+.highlight .nc {
+  color: #445588;
+  font-weight: bold;
+}
+.highlight .no {
+  color: #008080;
+}
+.highlight .nd {
+  color: #3c5d5d;
+  font-weight: bold;
+}
+.highlight .ni {
+  color: #800080;
+}
+.highlight .ne {
+  color: #990000;
+  font-weight: bold;
+}
+.highlight .nf {
+  color: #990000;
+  font-weight: bold;
+}
+.highlight .nl {
+  color: #990000;
+  font-weight: bold;
+}
+.highlight .nn {
+  color: #555555;
+}
+.highlight .nt {
+  color: #000080;
+}
+.highlight .vc {
+  color: #008080;
+}
+.highlight .vg {
+  color: #008080;
+}
+.highlight .vi {
+  color: #008080;
+}
+.highlight .nv {
+  color: #008080;
+}
+.highlight .ow {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .o {
+  color: #000000;
+  font-weight: bold;
+}
+.highlight .w {
+  color: #bbbbbb;
+}
+.highlight {
+  background-color: #f8f8f8;
+}
diff --git a/atom.xml b/atom.xml
new file mode 100644
index 0000000..f9c1b46
--- /dev/null
+++ b/atom.xml
@@ -0,0 +1,3607 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+ 
+ <title>TVM</title>
+ <link href="https://tvm.apache.org" rel="self"/>
+ <link href="https://tvm.apache.org"/>
+ <updated>2019-11-25T21:33:32-08:00</updated>
+ <id>https://tvm.apache.org</id>
+ <author>
+   <name></name>
+   <email></email>
+ </author>
+
+ 
+ <entry>
+   <title>Integrating TVM into PyTorch</title>
+   <link href="https://tvm.apache.org/2019/05/30/pytorch-frontend"/>
+   <updated>2019-05-30T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2019/05/30/pytorch-frontend</id>
+   <content type="html">&lt;p&gt;As TVM continuously demonstrates improvements to the efficiency of deep learning execution,
+it has become clear that PyTorch stands to benefit from directly leveraging the compiler stack.
+A major tenet of PyTorch is providing seamless and robust integrations that don’t get in the user’s way.
+To that end, PyTorch now has an official TVM-based backend, &lt;a href=&quot;https://github.com/pytorch/tvm&quot;&gt;torch_tvm&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;Usage is simple:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
+torch_tvm.enable()
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;That’s it!  PyTorch will then attempt to convert all operators it can to known Relay operators during its JIT compilation process.&lt;/p&gt;
+
+&lt;h3 id=&quot;background&quot;&gt;Background&lt;/h3&gt;
+
+&lt;p&gt;Unlike many other ML frameworks, PyTorch exposes an eager-execution programming interface.  This style of programming avoids graph-based meta-programming and focuses on the direct manipulation of n-dimensional arrays (tensors) in a Pythonic way.  As such, the framework was initially well suited for the experimentation and development of models, but not for automatic performance optimization or deployment.  To leverage optimizing compiler techniques, some large changes were recen [...]
+
+&lt;p&gt;&lt;img src=&quot;https://i.imgur.com/4XVHbJE.png&quot; alt=&quot;TVM Integration&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;PyTorch 1.0 introduced PyTorch IR, a PyTorch-specific intermediate representation for models, similar to Relay.  PyTorch programs can be converted into the IR via model tracing, which records the execution of a model, or via TorchScript, a subset of Python.  The new TVM backend lowers PyTorch IR to Relay and is able to transparently improve PyTorch performance with little user involvement.&lt;/p&gt;
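+
+&lt;p&gt;As a rough sketch of that flow (the toy module and tensor shapes below are purely hypothetical; only &lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm.enable&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;torch.jit.trace&lt;/code&gt; come from the API described here), tracing a model while the backend is enabled is what produces the IR that gets lowered:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch
+import torch_tvm
+
+# Hypothetical toy module, used only to illustrate the flow.
+class TwoLayer(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = torch.nn.Linear(16, 32)
+        self.fc2 = torch.nn.Linear(32, 8)
+
+    def forward(self, x):
+        return self.fc2(torch.relu(self.fc1(x)))
+
+torch_tvm.enable()
+
+model = TwoLayer().eval()
+example = torch.randn(4, 16)
+
+# Tracing records PyTorch IR; Relay-compatible subgraphs are marked here
+# and compiled on the first call, once input shapes are known.
+with torch.no_grad():
+    traced = torch.jit.trace(model, example)
+    out = traced(example)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;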
+
+&lt;h3 id=&quot;integration-and-results&quot;&gt;Integration and Results&lt;/h3&gt;
+
+&lt;p&gt;To support Relay, two features were added to the PyTorch JIT: custom transformation passes and custom subgraph interpreters.&lt;/p&gt;
+
+&lt;p&gt;When &lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; is enabled, subgraphs of PyTorch IR that can be converted to Relay &lt;code class=&quot;highlighter-rouge&quot;&gt;Expr&lt;/code&gt;s will be marked as Relay-compatible.  Since PyTorch IR does not always contain shape information, none of the subgraphs can be compiled in a useful way before invocation.&lt;/p&gt;
+
+&lt;p&gt;During user invocation, the PyTorch JIT runtime will determine input shape information and compile the previously marked subgraphs with the new Relay C++ &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/torch_tvm/compiler.cpp#L226-L246&quot;&gt;build system&lt;/a&gt;.  The compilation is cached based on input shapes for subsequent runs.  More details can be found in the &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/README.md&quot;&gt;README&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; has a continuous benchmark system set up, which is monitoring the performance of ResNet18 on CPU.
+Out of the box TVM provides over two times the performance of the default PyTorch JIT backend for various ResNet models.
+Below is a graph that details the iterations per second achieved with 16 threads on an AWS c5n.4xlarge instance (larger is better):&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;https://i.imgur.com/KfJ7oas.png&quot; alt=&quot;bench&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;These results are quite encouraging, and the project will continue to focus on improving CPU inference speed across more models.&lt;/p&gt;
+
+&lt;h3 id=&quot;future-work&quot;&gt;Future work&lt;/h3&gt;
+
+&lt;p&gt;Right now the PyTorch JIT does a lot of work to find pure functional subsets of its IR to feed to Relay.  This avoids the need to map aliasing and control flow information to Relay, but this restriction is not necessary.  Mapping more of the PyTorch IR to Relay may yield performance wins and is a goal of the project.  PyTorch IR is rapidly changing as it is being developed, so this must be done carefully.&lt;/p&gt;
+
+&lt;p&gt;More work will be done to ensure the hand-off between PyTorch and TVM code is efficient.  This includes unifying the threading model and allocators, and reducing the overhead associated with copying inputs into TVM.&lt;/p&gt;
+
+&lt;h3 id=&quot;tutorial&quot;&gt;Tutorial&lt;/h3&gt;
+
+&lt;p&gt;If you have an already-written PyTorch model, the easiest way to get started is to use &lt;code class=&quot;highlighter-rouge&quot;&gt;torch.jit.trace&lt;/code&gt; as follows&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import time
+
+import torch
+import torch_tvm
+from tvm import autotvm
+
+from your_model import model, inputs
+
+torch_tvm.enable(opt_level=3)
+
+iters = 100
+warmup = 10
+
+# Ensure your model is in eval mode and also turn off gradients.
+with torch.no_grad():
+  # Use tuned parameters for better performance.
+  with autotvm.apply_history_best(&quot;test/autotvm_tuning.log&quot;):
+    # This is where all the compilation happens.
+    trace_tvm = torch.jit.trace(model, inputs)
+    
+    # Warmup
+    for _ in range(warmup):
+      _ = trace_tvm(*inputs)
+
+    # Benchmark
+    start = time.time()
+    for _ in range(iters):
+      _ = trace_tvm(*inputs)
+    tvm_time = time.time() - start
+    
+    print(&quot;Took {}s to run {} iters&quot;.format(tvm_time, iters))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Much of this code comes from &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/test/benchmarks.py&quot;&gt;benchmarks.py&lt;/a&gt;.  Note that tuned parameters for AVX2 LLVM compilation are in the &lt;code class=&quot;highlighter-rouge&quot;&gt;test/&lt;/code&gt; folder of the repo.&lt;/p&gt;
+
+&lt;p&gt;If you are more comfortable using Relay directly, it is possible to simply extract the expression directly from a
+PyTorch function either via (implicit) tracing or TorchScript:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;def add(a, b, c):
+    return a + b + c
+
+# via tracing
+relay_graph = torch_tvm.to_relay(add, inputs)
+
+@torch.jit.script
+def mul(a, b, c):
+    return a * b * c
+
+# via script
+relay_graph = torch_tvm.to_relay(mul, inputs)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Automating Optimization of Quantized Deep Learning Models on CUDA</title>
+   <link href="https://tvm.apache.org/2019/04/29/opt-cuda-quantized"/>
+   <updated>2019-04-29T09:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2019/04/29/opt-cuda-quantized</id>
+   <content type="html">&lt;p&gt;Deep learning has been successfully applied to a variety of tasks.
+In real-time scenarios such as inference on autonomous vehicles, the inference speed of the model is critical.
+Network quantization is an effective approach to accelerating deep learning models.
+In quantized models, both data and model parameters are represented with low precision data types such as &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;float16&lt;/code&gt;.
+The lowered data bandwidth reduces the inference time and memory/storage requirements, as well as the power consumption.
+Meanwhile, under proper quantization schemes, we can minimize the accuracy drops of the quantized models.
+Therefore, quantized models are of particular interest to researchers and developers, as quantization makes large models suitable to deploy on diverse devices such as GPU, CPU and mobile devices.&lt;/p&gt;
+
+&lt;p&gt;Previously, quantized operators were usually optimized with handcrafted microkernels for different workloads, or relied on blackbox proprietary solutions such as cuDNN and TensorRT.
+Writing a high-performance microkernel in assembly can be very challenging and usually requires heavy engineering effort.
+Besides, it is difficult to adapt these ad-hoc microkernels to emerging workloads and new devices.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/benchmark.svg&quot; alt=&quot;image&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 1. Inference time of different models on TVM, TensorRT, and MXNet &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM solves this challenge with a full stack compiler and a machine-learning-based optimizer to automatically generate computing kernels.
+TVM can generate efficient kernels via automatic search in a human-designed search space.
+In standard workloads such as VGG and ResNet, TVM achieves competitive performance compared with other state-of-the-art frameworks. 
+In emerging models such as ResNeXt and Deformable ConvNets, the automatic optimization makes it easy for TVM to adapt to these new workloads and achieve a significant performance boost.&lt;/p&gt;
+
+&lt;p&gt;In this post, we show how to use TVM to automatically optimize quantized deep learning models on CUDA.&lt;/p&gt;
+
+&lt;h1 id=&quot;expressing-quantized-cuda-kernels-in-tvm&quot;&gt;Expressing Quantized CUDA Kernels in TVM&lt;/h1&gt;
+&lt;h2 id=&quot;leveraging-tensor-intrinsics-via-tensorization&quot;&gt;Leveraging Tensor Intrinsics via Tensorization&lt;/h2&gt;
+&lt;p&gt;Many platforms provide architecture-specific instructions for special computation patterns, for example, the SIMD instructions on x86, and the &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;hfma&lt;/code&gt; instructions on CUDA.
+These intrinsic instructions are highly optimized for specific devices.
+By leveraging hardware intrinsics, we can achieve a significant performance boost for quantized operators.&lt;/p&gt;
+
+&lt;p&gt;Currently, &lt;a href=&quot;https://devblogs.nvidia.com/mixed-precision-programming-cuda-8/&quot;&gt;dp4a&lt;/a&gt; has been extensively used in TVM int8 operators on CUDA.
+&lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; is a CUDA intrinsic on Compute Capability 6.1 devices.
+It is a mixed-precision instruction that provides the efficient computation of the dot product between two 4-element 8-bit integer vectors and accumulates the result in 32-bit format.
+Using &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, we can implement a dot product between 8-bit integer vectors with a number of elements evenly divisible by four.
+With an efficient dot product operator, we can implement high-level operators such as 2d convolution and dense layers as these operators are commonly backed by dot products.&lt;/p&gt;
+
+&lt;p&gt;To illustrate, in 2d convolution we accumulate along the channel, the width, and the height axis of the kernel.
+This is a typical use case of &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;.
+TVM uses tensorization to support calling external intrinsics.
+We do not need to modify the original computation declaration; we use the schedule primitive &lt;code class=&quot;highlighter-rouge&quot;&gt;tensorize&lt;/code&gt; to replace the accumulation with &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; tensor intrinsic.
+More details of tensorization can be found in the &lt;a href=&quot;https://docs.tvm.ai/tutorials/language/tensorize.html&quot;&gt;tutorial&lt;/a&gt;.&lt;/p&gt;
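+
+&lt;p&gt;As a rough illustration (a minimal sketch, not the actual TOPI implementation), the compute pattern that &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; accelerates can be declared as below; the intrinsic object referenced in the final comment is assumed to be declared separately with &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.decl_tensor_intrin&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import tvm
+
+# Hypothetical shapes: an int8 dot product reduced over an axis whose
+# length is a multiple of four, which is the pattern dp4a accelerates.
+n = 1024
+A = tvm.placeholder((n,), dtype='int8', name='A')
+B = tvm.placeholder((n,), dtype='int8', name='B')
+k = tvm.reduce_axis((0, n), name='k')
+C = tvm.compute((1,), lambda _: tvm.sum(
+    A[k].astype('int32') * B[k].astype('int32'), axis=k), name='C')
+
+s = tvm.create_schedule(C.op)
+ko, ki = s[C].split(k, factor=4)
+# With a dp4a tensor intrinsic declared via tvm.decl_tensor_intrin
+# (see topi/cuda/tensor_intrin.py), the inner four-element reduction
+# could be replaced by a single instruction:
+#   s[C].tensorize(ki, dp4a_intrin)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;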
+
+&lt;h2 id=&quot;data-layout-rearrangement&quot;&gt;Data Layout Rearrangement&lt;/h2&gt;
+&lt;p&gt;One of the challenges in tensorization is that we may need to design special computation logic to adapt to the requirement of tensor intrinsics.
+Although it is natural to accumulate along the inner axis of the tensor in the dense operator, &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; can be more challenging.
+In &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; we expect to take a slice in the channel dimension as the input of &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; because the number of channels is typically a multiple of 4 (otherwise we fall back to the original &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; in NCHW layout).
+Meanwhile, to achieve memory locality, we would like to reduce along the innermost axis first.
+Taking these factors into account, we use a custom data layout to address this challenge.&lt;/p&gt;
+
+&lt;p&gt;In CUDA int8 2d convolution, we empirically choose &lt;code class=&quot;highlighter-rouge&quot;&gt;NCHW4c&lt;/code&gt; as data layout and &lt;code class=&quot;highlighter-rouge&quot;&gt;OIHW4o4i&lt;/code&gt; as weight layout.
+The templates can also be easily generalized to &lt;code class=&quot;highlighter-rouge&quot;&gt;NCHW[x]c&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;OIHW[x]o[x]i&lt;/code&gt;, where x is an arbitrary positive integer divisible by four.
+In the data layout we choose, slices of channels are in the packed innermost dimension.
+Likewise, we pack slices in both the input and output channel dimensions of the weight so that the output has a consistent data layout with the input, which prevents redundant layout transformations between layers.&lt;/p&gt;
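+
+&lt;p&gt;To make the layout concrete, here is a small illustrative sketch (plain NumPy, not TVM code, with made-up shapes) of how an NCHW tensor maps to NCHW4c: the channel axis is split into a factor of four that becomes the innermost packed dimension.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+# Split the channel axis C into (C // 4, 4) and move the factor-4 slice
+# to the innermost position.
+x_nchw = np.random.randn(1, 64, 56, 56).astype('float32')
+n, c, h, w = x_nchw.shape
+x_nchw4c = x_nchw.reshape(n, c // 4, 4, h, w).transpose(0, 1, 3, 4, 2)
+print(x_nchw4c.shape)  # (1, 16, 56, 56, 4)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;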
+
+&lt;p&gt;We show the computation of one element of the output of the 2d convolution in Figure 2.
+The element at each position of the super dimensions (the outer NCHW and OIHW dimensions of the blocked layout, which contain the packed elements) is the packed input and the packed kernel, respectively.
+Each column of the packed kernel comes from a different filter.
+We calculate the dot product between the packed input and each row in the packed kernel using &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, and accumulate the result to the output tensor.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/conv2d.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+&lt;div&gt;
+Figure 2. 2D convolution with data layout in NCHW4c and weight layout in OIHW4o4i.
+&lt;b&gt;Left&lt;/b&gt;: The input tensor in NCHW4c layout. One moving filter of the kernel is colored in blue. One element of the input and kernel is colored in grey. 
+&lt;b&gt;Mid&lt;/b&gt;: The packed input and kernel in the grey block.
+&lt;b&gt;Right&lt;/b&gt;: The output in NCHW4c layout. Inside the one element depicted, there are four packed elements in channel sub-dimension.
+&lt;/div&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;After we have specified the layout of convolution layers, other operators such as &lt;code class=&quot;highlighter-rouge&quot;&gt;add&lt;/code&gt; and activations can automatically adapt to the chosen layout during the &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/src/relay/pass/alter_op_layout.cc&quot;&gt;AlterOpLayout&lt;/a&gt; pass in Relay.
+The layout transformation of the weight can be precomputed offline. Therefore, we can run the whole model in the same layout without extra overhead.&lt;/p&gt;
+
+&lt;h2 id=&quot;designing-search-space-for-automatic-optimization&quot;&gt;Designing Search Space for Automatic Optimization&lt;/h2&gt;
+&lt;p&gt;The key to achieving good performance in our quantized operators is to integrate with machine-learning-based automatic optimization. One question is how to design an effective schedule search space.
+An effective schedule template means that we can obtain good performance in a reasonable number of iterations in automatic tuning.
+Generally speaking, we strive to define a flexible template to cover different configurations in the search space.
+On the other hand, we also take advantage of the prior knowledge in performance optimization.
+For example, as caching data in the shared memory is a common practice in CUDA programming, we utilize shared memory, but we use machine learning to choose the best tile size.
+We also do some manual tiling such as splitting axes by 4 or 16 to facilitate vectorized memory access.&lt;/p&gt;
+
+&lt;p&gt;In quantized 2d convolution, we design a search space that includes a set of tunable options, such as the tile size, the axes to fuse, configurations of loop unrolling and double buffering.
+The templates of quantized &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;dense&lt;/code&gt; on CUDA are registered under template key &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt;.
+During automatic tuning, we can create tuning tasks for these quantized operators by setting the &lt;code class=&quot;highlighter-rouge&quot;&gt;template_key&lt;/code&gt; argument.
+Details of how to launch automatic optimization can be found in the &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_relay_cuda.html&quot;&gt;AutoTVM tutorial&lt;/a&gt;.&lt;/p&gt;
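+
+&lt;p&gt;As a hedged sketch of the mechanism described above (following the pattern used in that tutorial; &lt;code class=&quot;highlighter-rouge&quot;&gt;task&lt;/code&gt; stands for one task already extracted from the model), a task can be re-created so that it uses the int8 template:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;from tvm import autotvm
+
+# Re-create an extracted conv2d task so that it is tuned with the
+# schedule template registered under the 'int8' key.
+int8_task = autotvm.task.create(task.name, task.args,
+                                task.target, task.target_host,
+                                template_key='int8')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;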
+
+&lt;h1 id=&quot;general-workflow&quot;&gt;General Workflow&lt;/h1&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/workflow.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 3. Workflow of running quantized models &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM provides an easy workflow to quantize trained models from other frameworks, automatically optimize operators (with AutoTVM), and deploy to different devices.&lt;/p&gt;
+
+&lt;p&gt;First, we use the Relay frontend to import existing models. Here we use an MXNet model with &lt;code class=&quot;highlighter-rouge&quot;&gt;(1, 3, 224, 224)&lt;/code&gt; input shape as an example.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg_params&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;aux_params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&l [...]
+&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Next, we use the relay quantization API to convert it to a quantized model.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Then, we use AutoTVM to extract tuning tasks for the operators in the model and perform automatic optimization. The &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_relay_cuda.html&quot;&gt;AutoTVM tutorial&lt;/a&gt; provides an example for this.&lt;/p&gt;
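+
+&lt;p&gt;For reference, a minimal tuning loop might look like the sketch below, assuming &lt;code class=&quot;highlighter-rouge&quot;&gt;tasks&lt;/code&gt; has already been extracted from the quantized model as in that tutorial; the trial count and log file name are illustrative.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;from tvm import autotvm
+
+for task in tasks:
+    # Use the XGBoost-based cost model to search the schedule space
+    # and log the best configuration found for each operator.
+    tuner = autotvm.tuner.XGBTuner(task)
+    tuner.tune(n_trial=1000,
+               measure_option=autotvm.measure_option(
+                   builder=autotvm.LocalBuilder(),
+                   runner=autotvm.LocalRunner(number=10, repeat=3)),
+               callbacks=[autotvm.callback.log_to_file('autotvm_tuning.log')])
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;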
+
+&lt;p&gt;Finally, we build the model and run inference in the quantized mode.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;opt_level&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt [...]
+    &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;s [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;The result of &lt;code class=&quot;highlighter-rouge&quot;&gt;relay.build&lt;/code&gt; is a deployable library.
+We can either run inference &lt;a href=&quot;https://docs.tvm.ai/tutorials/frontend/from_mxnet.html#execute-the-portable-graph-on-tvm&quot;&gt;on the GPU&lt;/a&gt; directly or deploy &lt;a href=&quot;https://docs.tvm.ai/tutorials/frontend/deploy_model_on_rasp.html#deploy-the-model-remotely-by-rpc&quot;&gt;on the remote devices&lt;/a&gt; via RPC.&lt;/p&gt;
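+
+&lt;p&gt;For example, local GPU inference with the artifacts returned by &lt;code class=&quot;highlighter-rouge&quot;&gt;relay.build&lt;/code&gt; can be sketched as follows; the input name and shape are illustrative and depend on the imported model.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+import tvm
+from tvm.contrib import graph_runtime
+
+# Create a graph runtime module on the GPU, load the parameters,
+# feed an input tensor, run, and read back the output.
+ctx = tvm.gpu(0)
+module = graph_runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input('data', np.random.uniform(size=(1, 3, 224, 224)).astype('float32'))
+module.run()
+out = module.get_output(0).asnumpy()
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;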
+
+&lt;h1 id=&quot;benchmark&quot;&gt;Benchmark&lt;/h1&gt;
+&lt;p&gt;To verify the performance of the quantized operators in TVM, we benchmark the performance of several popular network models including VGG-19, ResNet-50 and Inception V3.
+We also benchmark on DRN-C-26, ResNeXt-50, and DCN-ResNet-101 from &lt;a href=&quot;https://github.com/msracver/Deformable-ConvNets&quot;&gt;Deformable ConvNets&lt;/a&gt; to show the performance of emerging models, which contain less conventional operators such as dilated convolutions, group convolutions and deformable convolutions.
+We choose NVIDIA TensorRT as our baseline.
+The result of MXNet 1.4 + cuDNN 7.3 in float32 mode is reported to show the speedup from quantization.
+The experiments are conducted on an NVIDIA GTX 1080.
+We report the inference time per image when running with batch size 1 and 16.&lt;/p&gt;
+
+&lt;p&gt;As shown in the Figure 1, TVM achieves up to 8x speedup using quantization.
+In standard CNN models such as VGG and ResNet, TVM achieves parity with the state-of-the-art results from TensorRT.&lt;/p&gt;
+
+&lt;p&gt;When benchmarking emerging models, TVM achieves impressive results.
+We obtain significant performance gains on ResNeXt and DCN-ResNet-101.
+TensorRT results for DCN-ResNet-101 are not available because there is no official implementation of the deformable convolution.
+We show that automatic optimization in TVM makes it easy and flexible to support and optimize emerging workloads.&lt;/p&gt;
+
+&lt;h1 id=&quot;show-me-the-code&quot;&gt;Show Me the Code&lt;/h1&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/vinx13/tvm-cuda-int8-benchmark&quot;&gt;Benchmark&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/conv2d_int8.py&quot;&gt;CUDA int8 conv2d&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/group_conv2d_nchw.py&quot;&gt;CUDA int8 group conv2d&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/dense.py&quot;&gt;CUDA int8 dense&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/tensor_intrin.py&quot;&gt;Tensor intrinsics declaration&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;bio--acknowledgement&quot;&gt;Bio &amp;amp; Acknowledgement&lt;/h1&gt;
+&lt;p&gt;&lt;a href=&quot;https://wuwei.io/&quot;&gt;Wuwei Lin&lt;/a&gt; is an undergraduate student at SJTU. He is currently an intern at TuSimple. The author has many thanks to &lt;a href=&quot;https://homes.cs.washington.edu/~tqchen/&quot;&gt;Tianqi Chen&lt;/a&gt; and &lt;a href=&quot;https://homes.cs.washington.edu/~eqy/&quot;&gt;Eddie Yan&lt;/a&gt; for their reviews.&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>TVM Deep Learning Compiler Joins Apache Software Foundation</title>
+   <link href="https://tvm.apache.org/2019/03/18/tvm-apache-announcement"/>
+   <updated>2019-03-18T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2019/03/18/tvm-apache-announcement</id>
+   <content type="html">&lt;p&gt;There is an increasing need to bring machine learning to a wide diversity of hardware devices. Current frameworks rely on vendor-specific operator libraries and optimize for a narrow range of server-class GPUs. Deploying workloads to new platforms – such as mobile phones, embedded devices, and accelerators (e.g., FPGAs, ASICs) – requires significant manual effort.&lt;/p&gt;
+
+&lt;p&gt;TVM is an open source deep learning compiler stack that closes the gap between productivity-focused deep learning frameworks and performance- or efficiency-oriented hardware backends. Today, we are glad to announce that the TVM community has decided to move to the Apache Incubator, and TVM becomes an Apache (incubating) project.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/main/tvm-stack.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM stack began as a research project at the &lt;a href=&quot;https://sampl.cs.washington.edu/&quot;&gt;SAMPL group&lt;/a&gt; of Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington. The project uses the loop-level IR and several optimizations from the &lt;a href=&quot;http://halide-lang.org/&quot;&gt;Halide project&lt;/a&gt;, in addition to &lt;a href=&quot;https://tvm.ai/about&quot;&gt;a full deep learning compiler stack&lt;/a&gt; to support [...]
+
+&lt;p&gt;Since its introduction, the project was driven by an open source community involving multiple industry and academic institutions. Currently, the TVM stack includes a high-level differentiable programming IR for high-level optimization, a machine learning driven program optimizer and VTA – a fully open sourced deep learning accelerator. The community brings innovations from machine learning, compiler systems, programming languages, and computer architecture to build a full-stack  [...]
+
+&lt;p&gt;Besides the technical innovations, the community adopts an open, welcoming and neutral policy. The project is run by committers who are elected purely based on their merit of the contributions to the project. Besides the contributors from UW SAMPL, the community now has nearly 200 contributors that come from Amazon Web Services (AWS), Qualcomm, Facebook, Google, Huawei, AMD, Microsoft, Cornell University, University of California, Berkeley, and more.        The community success [...]
+
+&lt;p&gt;We would like to take this chance to thank the Allen School for supporting the SAMPL team that gave birth to the TVM project. We would also like to thank the Halide project which provided the basis for TVM’s loop-level IR and initial code generation. We would like to thank our Apache incubator mentors for introducing the project to Apache and providing useful guidance. Finally, we would like to thank the TVM community and all of the organizations, as listed above, that supported [...]
+
+&lt;p&gt;See also the &lt;a href=&quot;https://news.cs.washington.edu/2019/03/18/allen-schools-tvm-deep-learning-compiler-framework-transitions-to-apache/&quot;&gt;Allen School news about the transition here&lt;/a&gt;, &lt;a href=&quot;https://sampl.cs.washington.edu/tvmconf/#about-tvmconf&quot;&gt;TVM conference program slides and recordings&lt;/a&gt;, and &lt;a href=&quot;https://docs.tvm.ai/contribute/community.html&quot;&gt;our community guideline here&lt;/a&gt;. Follow us on Twitter [...]
+</content>
+ </entry>
+ 
+ <entry>
+   <title>TVM Golang Runtime for Deep Learning Deployment</title>
+   <link href="https://tvm.apache.org/2019/01/19/Golang"/>
+   <updated>2019-01-19T00:00:00-08:00</updated>
+   <id>https://tvm.apache.org/2019/01/19/Golang</id>
+   <content type="html">&lt;h2 id=&quot;introduction&quot;&gt;Introduction&lt;/h2&gt;
+
+&lt;p&gt;TVM is an open deep learning compiler stack to compile various deep learning models from different
+frameworks to CPU, GPU or specialized accelerators.  TVM supports model compilation from a wide range
+of front ends like Tensorflow, Onnx, Keras, Mxnet, Darknet, CoreML and Caffe2. TVM compiled modules
+can be deployed on backends like LLVM (Javascript or WASM, AMD GPU, ARM or X86), NVidia GPU (CUDA),
+OpenCL and Metal.&lt;/p&gt;
+
+&lt;p&gt;TVM supports runtime bindings for programming languages like Javascript, Java, Python, C++… and now Golang.
+With a wide range of frontend, backend and runtime bindings, TVM enables developers to integrate and
+deploy deep learning models from a variety of frameworks to a choice of hardware via many programming languages.&lt;/p&gt;
+
+&lt;p&gt;The TVM import and compilation process generates a graph JSON, a compiled module, and a params file. Any application that
+integrates the TVM runtime can load these compiled modules and perform inference. A detailed tutorial of module
+import and compilation using TVM can be found at &lt;a href=&quot;https://docs.tvm.ai/tutorials/&quot;&gt;tutorials&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;TVM now supports deploying compiled modules through Golang. Golang applications can make use of this
+to deploy deep learning models through TVM. The scope of this blog is the introduction of the &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package,
+the package build process, and a sample application using &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; to load a compiled module and perform inference.&lt;/p&gt;
+
+&lt;h2 id=&quot;package&quot;&gt;Package&lt;/h2&gt;
+
+&lt;p&gt;The golang package &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; is built on top of TVM’s C runtime interface. The API in this package
+abstracts the native C types and provides Golang compatible types. The package source can be found
+at &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/golang&quot;&gt;gotvm&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;This package leverages golang’s interfaces, slices, and function closures, and implicitly handles the
+necessary conversions across API calls.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/golang/TVM-Golang-Blog.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Golang Interface over TVM Runtime &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;how-to&quot;&gt;How to&lt;/h2&gt;
+
+&lt;p&gt;As shown in the diagram below, &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; enables golang applications to integrate deep learning models
+from various frameworks without the hassle of understanding each framework’s interface API.
+Developers can make use of TVM to import and compile deep learning models and generate TVM artifacts.
+The &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides a golang-friendly API to load, configure, feed input, and get output.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/golang/TVM-Golang-Flow.png&quot; alt=&quot;image&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Import, Compile, Integrate and Deploy&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM &lt;a href=&quot;https://docs.tvm.ai/tutorials/#compile-deep-learning-models&quot;&gt;Compile Deep Learning Models&lt;/a&gt; tutorials
+are available to compile models from all frameworks supported by the TVM frontend. This compilation process
+generates the artifacts required to integrate and deploy the model on a target.&lt;/p&gt;
+
+&lt;h2 id=&quot;api&quot;&gt;API&lt;/h2&gt;
+
+&lt;p&gt;The &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides a handful of datatypes and API functions to initialize, load, and infer
+from a golang application. Like any other golang package, we just need to import the &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package here.&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Module : The Module API can be used to load a TVM compiled module into TVM runtime and access any functions.&lt;/li&gt;
+  &lt;li&gt;Value : The Value API provides helper functions to set arguments or get return values in golang types like basic types or slices.&lt;/li&gt;
+  &lt;li&gt;Function : The Function API is useful for getting handles to functions and invoking them.&lt;/li&gt;
+  &lt;li&gt;Array : The Array API is useful for setting and getting Tensor data via golang slice.&lt;/li&gt;
+  &lt;li&gt;Context : The Context API contains helper functions to build backend context handles.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;example&quot;&gt;Example&lt;/h2&gt;
+
+&lt;p&gt;A simple example, with inline documentation, of loading a compiled module and performing inference is shown below.
+For simplicity, error handling is omitted here, but it is important in real applications.&lt;/p&gt;
+
+&lt;div class=&quot;language-cpp highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;
+&lt;span class=&quot;n&quot;&gt;package&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;main&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;// Import compiled gotvm package.&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;s&quot;&gt;&quot;./gotvm&quot;&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;// Some constants for TVM compiled model paths.&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;// modLib : Is the compiled library exported out of compilation.&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;// modJson : TVM graph JSON.&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;// modParams : Exported params out of TVM compilation process.&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modLib&lt;/span&gt;    &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;./libdeploy.so&quot;&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modJSON&lt;/span&gt;   &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;./deploy.json&quot;&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modParams&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;./deploy.params&quot;&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;// main&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;main&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+    &lt;span class=&quot;c1&quot;&gt;// Some util API to query underlying TVM and DLPack version information.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Printf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;TVM Version   : v%v&lt;/span&gt;&lt;span class=&quot;se&quot;&gt;\n&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+    &lt;span class=&quot;n&quot;&gt;fmt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Printf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;DLPACK Version: v%v&lt;/span&gt;&lt;span class=&quot;se&quot;&gt;\n\n&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+
+    &lt;span class=&quot;c1&quot;&gt;// Import tvm module (so).&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;LoadModuleFromFile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;modLib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Load module on tvm runtime - call tvm.graph_runtime.create&lt;/span&gt;
+    &lt;span class=&quot;c1&quot;&gt;// with module and graph JSON.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ioutil&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ReadFile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;modJSON&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;jsonStr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;string&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetGlobalFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;tvm.graph_runtime.create&quot;&lt;/span&gt;&lt;span cla [...]
+    &lt;span class=&quot;n&quot;&gt;graphrt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;jsonStr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
+    &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphrt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;AsModule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+
+
+    &lt;span class=&quot;c1&quot;&gt;// Allocate input &amp;amp; output arrays and fill some data for input.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;tshapeIn&lt;/span&gt;  &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int64&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;224&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;224&lt;/span&gt;&lt [...]
+    &lt;span class=&quot;n&quot;&gt;tshapeOut&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int64&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1001&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;inX&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tshapeIn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+    &lt;span class=&quot;n&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tshapeOut&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;make&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;244&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;244&lt;/span&gt [...]
+    &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Seed&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;10&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Shuffle&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;len&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;sp [...]
+                                               &lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
+                                               &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;})&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;inX&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;CopyFrom&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Load params&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ioutil&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ReadFile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;modParams&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;load_params&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+
+    &lt;span class=&quot;c1&quot;&gt;// Set module input&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;set_input&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;input&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inX&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Run or Execute the graph&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;run&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt; [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Get output from runtime.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;get_output&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int64&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Access output tensor data.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;outIntf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;AsSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;outSlice&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;outIntf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.([]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// outSlice here holds flattened output data as a golang slice.&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; extends the TVM packed function system to support golang function closures as packed functions.
+&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/sample&quot;&gt;Examples&lt;/a&gt; are available that register a golang
+closure as a TVM packed function and invoke it across programming language barriers.&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/src&quot;&gt;Package Source&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/sample&quot;&gt;Examples&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;[1] &lt;a href=&quot;https://golang.org&quot;&gt;Go Programming Lang&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[2] &lt;a href=&quot;https://blog.golang.org/godoc-documenting-go-code&quot;&gt;Go Documentation Guide Lines&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[3] &lt;a href=&quot;https://golang.org/pkg/testing&quot;&gt;Go Testcase Framework&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[4] &lt;a href=&quot;https://golang.org/cmd/cgo&quot;&gt;Go CFFI&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[5] &lt;a href=&quot;https://blog.learngoprogramming.com/golang-variadic-funcs-how-to-patterns-369408f19085&quot;&gt;Go Variadic Functions&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[6] &lt;a href=&quot;https://github.com/jdeng/gomxnet&quot;&gt;CFFI Ref&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[7] &lt;a href=&quot;https://golang.org/pkg/runtime/#SetFinalizer&quot;&gt;Go Finalizers&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Automating Generation of Low Precision Deep Learning Operators</title>
+   <link href="https://tvm.apache.org/2018/12/18/lowprecision-conv"/>
+   <updated>2018-12-18T00:00:00-08:00</updated>
+   <id>https://tvm.apache.org/2018/12/18/lowprecision-conv</id>
+   <content type="html">&lt;p&gt;As deep learning models grow larger and more complex, deploying them on low-powered phone and IoT
+devices becomes challenging because of their limited compute and energy budgets. A recent trend
+in deep learning is the use of extremely quantized models that operate on inputs and
+weights of a few bits, with networks like XNOR-Net, DoReFa-Net, and HWGQ-Net making steady
+progress improving accuracy.&lt;/p&gt;
+
+&lt;p&gt;An example of a low precision graph snippet is below. The low precision convolution takes in
+quantized data and bitpacks it into the proper data layout for an efficient bitserial convolution.
+The output is in a higher precision, and traditional deep learning layers such as batch normalization and ReLU are applied to it before it is re-quantized and sent through another low precision operator.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/workflow.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Low precision convolution pipeline.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;Theoretically, low precision operators use fewer operations than
+floating point operators, leading many to believe they can achieve tremendous speedups.
+However, deep learning frameworks leverage decades of engineering work through low level
+BLAS and LAPACK libraries that are incredibly well optimized, and CPUs include intrinsic
+instructions to accelerate these tasks. In practice, it is not simple to develop low-level
+operators such as convolutions that are competitive with 8-bit quantized or even floating
+point operators.
+In this post we introduce our approach to automatically generating optimized
+low precision convolutions for CPUs. We declare our low precision operators so that they compute
+on efficiently stored low precision inputs, and define a schedule that describes a search space
+of implementation parameters. We rely on AutoTVM to quickly search the space and find optimized
+parameters for the particular convolution, precision, and backend.&lt;/p&gt;
+
+&lt;h2 id=&quot;bitserial-computation-background&quot;&gt;Bitserial Computation Background&lt;/h2&gt;
+
+&lt;p&gt;The core of low precision models is the bitserial dot product, which enables convolution and
+dense operators to be computed using only bitwise operations and popcount.
+Typically, a dot product is computed by element-wise multiplication of two vectors followed by
+summing all the elements, like the simple example below. If all the data is binary, the input
+vectors can be packed into a single integer, and the dot product can be computed by bitwise-anding
+the packed inputs and counting the number of 1’s in the result using popcount.
+Note: depending on how the input data is quantized, bitwise-xnor may be used instead of bitwise-and.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/binary-dotproduct.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Binary dot product.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
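+
+&lt;p&gt;The same idea can be written in a few lines of plain Python (a toy illustration, not TVM code): pack the bits, AND the packed words, and popcount the result.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# Two binary vectors packed into single integers.
+a_bits = 0b10110011
+b_bits = 0b11010110
+
+# Binary dot product: bitwise AND, then count the set bits.
+dot = bin(a_bits &amp;amp; b_bits).count('1')
+print(dot)  # 3
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;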
+
+&lt;p&gt;Arbitrary precision dot products can be computed in this fashion by first separating the input data
+into bitplanes. Once in this representation, we can compute the dot product by summing weighted binary
+dot products between the bitplanes of A and B. The number of binary dot products grows with the
+product of A and B’s precision, so this method is only practical for very low precision data.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/bitserial-dotproduct.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Bitserial dot product.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
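+
+&lt;p&gt;A toy NumPy version of the bitserial dot product (made-up values, 2-bit data against 1-bit data) makes the weighting of the bitplane dot products explicit:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+a = np.array([3, 1, 2, 0], dtype=np.uint8)   # 2-bit data
+w = np.array([1, 0, 1, 1], dtype=np.uint8)   # 1-bit data
+
+# Sum the binary dot products of each bitplane of a against w,
+# weighting bitplane i by 2**i.
+result = 0
+for bit in range(2):
+    a_plane = (a // (2 ** bit)) % 2
+    result += (2 ** bit) * int(np.sum(a_plane * w))
+
+print(result, int(np.dot(a, w)))  # both print 5
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;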
+
+&lt;h2 id=&quot;defining-operators-in-tvm&quot;&gt;Defining Operators in TVM&lt;/h2&gt;
+&lt;p&gt;Before the computation, the input data needs to be bitpacked so that the bitplanes of the input data
+can be accessed and are packed into a supported datatype such as a uint8 or uint32. We provide
+a flexible bitpacking operator that takes arbitrarily sized input tensors and returns a bitpacked
+tensor where the user specifies which axis the bitplanes should be placed on.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/bitpack.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Different bitpacked layouts.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;Once in this bitpacked format the low precision convolution can be computed bitserially.
+For this demo, the data is packed along the input channel, the bitplanes are added to the
+innermost axis, and the data is packed into 32-bit integers. The bitserial convolution is computed
+similarly to a normal convolution, but bitwise-and (&amp;amp;) replaces multiplication, and we use
+popcount to accumulate values in the packed data. The bitplane axes become additional reduction axes
+that compute the binary dot products between different bitplanes of the input and kernel.
+Finally, the output is computed in an unpacked format and in higher precision.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;Input_bitpacked&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bitpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;acti [...]
+&lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bitpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pack_axis&lt;/span&gt;&lt;span class=&quot;o&quot;& [...]
+&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt [...]
+
+&lt;span class=&quot;n&quot;&gt;stride_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_down&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_right&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_pad_tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;( [...]
+
+&lt;span class=&quot;c1&quot;&gt;# Computing the output shape
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;out_channel&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_filter&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;simplify&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;in_height&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+ [...]
+&lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;simplify&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;in_width&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+& [...]
+&lt;span class=&quot;n&quot;&gt;pad_before&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;pad_after&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_down&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_right&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&l [...]
+&lt;span class=&quot;n&quot;&gt;Input_padded&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Input_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_before&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_after&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+
+&lt;span class=&quot;c1&quot;&gt;# Treat the bitplane axes like additional reduction axes
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&l [...]
+&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
+&lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
+&lt;span class=&quot;n&quot;&gt;ib&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt [...]
+&lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &l [...]
+
+
+&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; [...]
+             &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;popcount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+               &lt;span class=&quot;n&quot;&gt;Input_padded&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt; [...]
+               &lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ff&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
+               &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt;&lt;spa [...]
+
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;In our schedule we apply common optimizations like vectorization and memory tiling to provide better
+memory locality and take advantage of SIMD units. Some of these optimizations, such as tiling,
+require parameters that need to be tuned for the specific microarchitecture. We expose these
+parameters as knobs to TVM and use AutoTVM to automatically tune all the parameters simultaneously.&lt;/p&gt;
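+
+&lt;p&gt;As a rough illustration (not the exact template used in TOPI), tiling factors and other schedule
+choices can be declared as AutoTVM knobs inside a tunable template; the helper make_schedule_and_axes
+below is a hypothetical placeholder:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# A minimal sketch of exposing schedule parameters as AutoTVM knobs.
+# The operator body and axis names are illustrative placeholders.
+from tvm import autotvm
+
+@autotvm.template
+def bitserial_conv_template(*args):
+    cfg = autotvm.get_config()
+    s, arg_bufs, (n, f, y, x) = make_schedule_and_axes(*args)  # hypothetical helper
+    # Let the tuner choose how to split the output channel and width axes ...
+    cfg.define_split('tile_f', f, num_outputs=2)
+    cfg.define_split('tile_x', x, num_outputs=2)
+    # ... and whether to unroll the innermost loop.
+    cfg.define_knob('unroll', [0, 1])
+    return s, arg_bufs
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;AutoTVM then searches this joint space and records the best configuration found for each workload.&lt;/p&gt;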
+
+&lt;p&gt;Finally, we can craft small microkernels to replace the innermost loop(s) of computation and schedule
+ them using TVM’s tensorize primitive. Since compilers often produce suboptimal code, people can
+ often write short assembly sequences that are more efficient. These microkernels often take advantage
+ of new intrinsics that are being introduced to help accelerate deep learning workloads and use
+ them in clever ways to improve memory accesses or reduce the number of instructions required.&lt;/p&gt;
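+
+&lt;p&gt;To make the last point concrete, the schematic below shows how a hand-written routine could be
+declared as a tensor intrinsic and substituted for the innermost loop; popcount_dot_microkernel is a
+hypothetical external routine, not part of TVM or of this post’s actual schedule:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# A schematic tensorize sketch; names and shapes are illustrative.
+import tvm
+
+def intrin_popcount_dot(k):
+    # Declared semantics of the microkernel: c[0] = sum_i popcount(a[i] AND b[i])
+    a = tvm.placeholder((k,), dtype='uint8', name='a')
+    b = tvm.placeholder((k,), dtype='uint8', name='b')
+    i = tvm.reduce_axis((0, k), name='i')
+    c = tvm.compute((1,), lambda _:
+        tvm.sum(tvm.popcount(a[i] &amp;amp; b[i]).astype('uint16'), axis=i), name='c')
+
+    def intrin_func(ins, outs):
+        aa, bb = ins
+        cc = outs[0]
+        # Replace the generated loop body with a call to the hand-written routine.
+        return tvm.call_extern('int32', 'popcount_dot_microkernel',
+                               cc.access_ptr('w'), aa.access_ptr('r'),
+                               bb.access_ptr('r'), k)
+
+    return tvm.decl_tensor_intrin(c.op, intrin_func)
+
+# Later, in the schedule:
+#   s[Output].tensorize(innermost_reduce_axis, intrin_popcount_dot(K))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;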
+
+&lt;h2 id=&quot;results&quot;&gt;Results&lt;/h2&gt;
+
+&lt;h3 id=&quot;raspberry-pi&quot;&gt;Raspberry Pi&lt;/h3&gt;
+&lt;p&gt;Convolution speedups on Raspberry Pi 3B compared to a 16-bit integer TVM implementation.
+Workloads are convolution layers from ResNet18.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/rasp-conv.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Speedup of low precision convolutions on a Raspberry Pi compared to a 16-bit TVM implementation.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;2-bit activation, 1-bit weight convolution speedups on Raspberry Pi 3B compared to the hand-optimized implementation from &lt;a href=&quot;https://arxiv.org/pdf/1712.02427.pdf&quot;&gt;High performance ultra-low-precision convolutions
+on mobile devices&lt;/a&gt;.
+Workloads are convolution layers from ResNet18.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/rasp-conv-2.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Speedup of 2-bit weight 1-bit activation Raspberry Pi convolutions against a hand optimized implementation.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;x86&quot;&gt;x86&lt;/h3&gt;
+
+&lt;p&gt;Convolution speedups on x86 compared to a 32-bit floating point TVM implementation.
+Note: x86 does not provide a vectorized popcount instruction on this microarchitecture, so speedups are lower.&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/x86-conv.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Speedup of x86 low precision convolutions compared to a 32-bit floating point TVM implementation.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/nn/bitserial_conv2d.py&quot;&gt;TOPI bitserial convolution&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/arm_cpu/bitserial_conv2d.py&quot;&gt;TOPI ARM cpu bitserial convolution&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;[1] &lt;a href=&quot;https://arxiv.org/abs/1810.11066&quot;&gt;Automating Generation of Low Precision Deep Learning Operators&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[2] &lt;a href=&quot;https://arxiv.org/abs/1603.05279&quot;&gt;XNOR-Net&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[3] &lt;a href=&quot;https://arxiv.org/abs/1702.00953&quot;&gt;HWGQ&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[4] &lt;a href=&quot;https://arxiv.org/abs/1606.06160&quot;&gt;DoReFa&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Efficient Privacy-Preserving ML Using TVM</title>
+   <link href="https://tvm.apache.org/2018/10/09/ml-in-tees"/>
+   <updated>2018-10-09T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2018/10/09/ml-in-tees</id>
+   <content type="html">&lt;p&gt;This post describes Myelin, a framework for privacy-preserving machine learning in trusted hardware enclaves, and how TVM makes Myelin fast.
+The key idea is that TVM, unlike other popular ML frameworks, compiles models into lightweight, optimized, and dependency-free libraries which can fit into resource constrained enclaves.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/tvmfits.png&quot; alt=&quot;TVM fits in enclaves&quot; style=&quot;width: 80vw; max-width: 600px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Give creating a privacy-preserving ML model a try! Check out the &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/sgx&quot;&gt;example code&lt;/a&gt; available in the TVM repo.&lt;/p&gt;
+
+&lt;h1 id=&quot;motivation-privacy-preserving-ml&quot;&gt;Motivation: Privacy-Preserving ML&lt;/h1&gt;
+
+&lt;p&gt;Machine learning models benefit from large and diverse datasets.
+Unfortunately, using such datasets often requires trusting a centralized data aggregator or computation provider.
+For sensitive applications like healthcare and finance this is undesirable as it could compromise patient privacy or divulge trade secrets.
+Recent advances in secure and privacy-preserving computation, including &lt;em&gt;trusted execution environments&lt;/em&gt; and &lt;em&gt;differential privacy&lt;/em&gt;, offer a way for mutually distrusting parties to efficiently train a machine learning model without compromising the training data.
+We use TVM to make our privacy-preserving ML framework fast.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/sgx.png&quot; alt=&quot;Myelin workflow&quot; style=&quot;width: 80vw; max-width: 600px&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;use-cases&quot;&gt;Use Cases&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;private MLaaS&lt;/strong&gt;: a cloud provider runs their architecture on your data. You get the model outputs, your data stays private, and the cloud provider knows that you can’t steal the model.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;trustworthy ML competitions&lt;/strong&gt;: you train a model on contest data. The contest organizer sends private test data to your model and gets a verifiable report of accuracy. Your model stays safe until the organizer decides to purchase it. Other participants can’t cheat by training on test data.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;training on shared private data&lt;/strong&gt;: you (a researcher) want to train a model on several hospitals’ data. Directly sharing is too complicated. Instead, have a “trusted third party” train a privacy-preserving model.&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;http://www.vldb.org/pvldb/vol11/p2086-hynes.pdf&quot;&gt;&lt;strong&gt;ML on the Blockchain&lt;/strong&gt;&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;background&quot;&gt;Background&lt;/h1&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/dpnn.png&quot; alt=&quot;sketch of DP deep learning in a TEE&quot; style=&quot;width: 80vw; max-width: 400px&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;trusted-execution-environments&quot;&gt;Trusted Execution Environments&lt;/h2&gt;
+
+&lt;p&gt;A &lt;a href=&quot;https://en.wikipedia.org/wiki/Trusted_Computing#Remote_attestation&quot;&gt;trusted execution environment&lt;/a&gt; (TEE) essentially allows a remote user to provably run code on another person’s machine without revealing the computation to the hardware provider.&lt;/p&gt;
+
+&lt;p&gt;More technically, the TEE provides a secure &lt;em&gt;enclave&lt;/em&gt; of isolated/encrypted memory and CPU registers, as well as a trusted source of randomness.
+The TEE can also send a signed attestation of the code that’s loaded so that the remote user can verify that the enclave has been correctly loaded.
+This process, known as remote attestation, can be used to establish a secure communication channel into the enclave.
+The remote user can then provision it with secrets like private keys, model parameters, and training data.&lt;/p&gt;
+
+&lt;p&gt;Compared to pure crypto methods like &lt;a href=&quot;https://en.wikipedia.org/wiki/Garbled_circuit&quot;&gt;secure multi-party computation (MPC)&lt;/a&gt; and &lt;a href=&quot;https://en.wikipedia.org/wiki/Homomorphic_encryption#Fully_homomorphic_encryption&quot;&gt;fully-homomorphic encryption (FHE)&lt;/a&gt;, TEEs are several orders of magnitude faster and support general-purpose computation (i.e. not just arithmetic operations).
+Perhaps the only drawbacks are the additional trust assumptions in the hardware root of trust (a key burned into the processor) and loaded software.&lt;/p&gt;
+
+&lt;p&gt;Trust assumptions notwithstanding, TEE technology is becoming increasingly widespread and is playing a major role in practical privacy-preservation.
+In fact, general-purpose TEEs already exist in commodity hardware like &lt;a href=&quot;https://software.intel.com/en-us/sgx&quot;&gt;Intel SGX&lt;/a&gt; and &lt;a href=&quot;https://genode.org/documentation/articles/trustzone&quot;&gt;ARM TrustZone&lt;/a&gt;.
+Additionally, the fully-open source &lt;a href=&quot;https://keystone-enclave.org&quot;&gt;Keystone enclave&lt;/a&gt; is on the way.&lt;/p&gt;
+
+&lt;h2 id=&quot;differential-privacy&quot;&gt;Differential Privacy&lt;/h2&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/dp.png&quot; alt=&quot;DP as false positive/negative&quot; style=&quot;width: 80vw; max-width: 500px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://en.wikipedia.org/wiki/Differential_Privacy#Randomized_Response&quot;&gt;Differential privacy (DP)&lt;/a&gt; provides a formal guarantee that models trained on similar datasets are indistinguishable.
+Informally, a user’s privacy is not compromised by choosing to contribute data to a model.&lt;/p&gt;
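+
+&lt;p&gt;For reference (this formula is standard and not specific to Myelin), the usual (ε, δ) statement of
+the guarantee is: for any two datasets D and D′ that differ in a single record, and any set S of outputs,
+a randomized mechanism M satisfies&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Pr[ M(D) ∈ S ]  ≤  exp(ε) · Pr[ M(D′) ∈ S ] + δ
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;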
+
+&lt;p&gt;In other words, given the output of an algorithm on two datasets which differ in only a single record, differential privacy upper bounds the probability that an adversary can determine which dataset was used.
+An algorithm may be made DP using a mechanism which adds noise to the algorithm’s output.
+The amount of noise is calibrated to how much the output depends on any particular input.
+If you’re familiar with hypothesis testing: when outcomes A and B each have probability 0.5, applying a DP mechanism is like convolving with a probability distribution, and the privacy shows up in the false positive and false negative rates.
+Since deep learning models tend to generalize well, the amount of noise is often less than might be expected.&lt;/p&gt;
+
+&lt;p&gt;Running a DP training algorithm in a TEE ensures that the DP mechanism is faithfully applied.&lt;/p&gt;
+
+&lt;h1 id=&quot;efficient-privacy-preserving-ml-using-tvm&quot;&gt;Efficient Privacy-Preserving ML Using TVM&lt;/h1&gt;
+
+&lt;p&gt;One of the primary challenges of working with a TEE is that the code running within does not have access to the untrusted OS.
+This means that the trusted software cannot create threads or perform I/O.
+Practically speaking, the result is that numerical libraries like OpenBLAS, much less frameworks like PyTorch and TensorFlow, cannot run directly in enclaves.&lt;/p&gt;
+
+&lt;p&gt;TEEs actually have a similar programming model to resource-constrained hardware accelerators.
+This is exactly what TVM is made for!
+In the privacy workflow, a user first defines an entire training graph in the high-level graph specification language.
+TVM then compiles the model and outputs a static library containing optimized numerical kernels which can easily be loaded into a TEE.
+Since the kernels are automatically generated and have strict bounds checking, they expose a small attack surface.
+They are supported by a lightweight memory-safe Rust runtime which may also easily be reviewed for safety and correctness.&lt;/p&gt;
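+
+&lt;p&gt;As a rough sketch of that build step (the model, shapes and file names below are illustrative; see
+the apps/sgx directory in the TVM repo for the real build scripts), compiling with the system-lib option
+yields a static library that the enclave can link against:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# A hedged sketch: compile a small example network into a dependency-free
+# static library suitable for linking into an enclave.
+import nnvm.compiler
+import nnvm.testing
+
+# Stand-in model; a real deployment would import a trained graph instead.
+net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=1)
+
+# '--system-lib' bundles the generated kernels into a self-registering
+# static library with no external dependencies.
+graph, lib, params = nnvm.compiler.build(
+    net, target='llvm --system-lib',
+    shape={'data': (1, 3, 224, 224)}, params=params)
+
+lib.save('enclave_model.o')        # object file linked into the enclave binary
+open('enclave_graph.json', 'w').write(graph.json())
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;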
+
+&lt;p&gt;Of course, safety is most useful when practically applicable.
+Fortunately, TVM modules in enclaves have comparable performance to native CPU-based training.
+By coordinating threads using the untrusted runtime, a single TVM enclave can fully utilize the resources of its host machine.
+Moreover, it’s not difficult to imagine a secure parameter server which orchestrates entire datacenters of enclave-enabled machines.&lt;/p&gt;
+
+&lt;p&gt;TVM also provides opportunities for more subtle optimization of privacy-preserving algorithms.
+Indeed, its fine-grained scheduling features allow speedups when using differential privacy.
+For instance, the tightest DP bounds may be obtained by clipping the gradients of each training example and adding noise to each [1].
+In autograd frameworks, this requires forwarding the model for each example in the minibatch (though only one backward pass is needed) [2].
+Using TVM, however, per-example gradient clipping is straightforward: instead of scheduling each weight update as a single reduction over both batch and feature dimensions, the reduction is split into two.
+The reduction over features is followed by clipping and noising, and then the result is summed over examples to obtain the weight update.
+Thus, TVM allows applying differential privacy without introducing overhead greater than what is required by the technique.
+Also, if one wants to get really fancy, it’s possible to fuse the clipping and noising operations and apply them in-place to further trim down latency and memory usage.&lt;/p&gt;
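+
+&lt;p&gt;In plain NumPy (rather than TVM’s scheduling language), the arithmetic of this split reduction looks
+roughly like the sketch below; the clipping bound and noise multiplier are illustrative values:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# A NumPy sketch of the split reduction: reduce over feature dimensions
+# per example, clip and noise, then reduce over the batch dimension.
+import numpy as np
+
+def dp_weight_update(per_example_grads, clip_norm, sigma, rng):
+    batch = per_example_grads.shape[0]
+    flat = per_example_grads.reshape(batch, -1)
+    # First reduction: per-example L2 norms over the feature dimensions.
+    norms = np.linalg.norm(flat, axis=1, keepdims=True)
+    # Clip each example's gradient so its norm is at most clip_norm.
+    clipped = flat * np.minimum(1.0, clip_norm / (norms + 1e-12))
+    # Noise calibrated to the clipping bound (sigma is the noise multiplier).
+    noise = rng.normal(0.0, sigma * clip_norm, size=flat.shape[1])
+    # Second reduction: sum over examples to obtain the weight update.
+    return (clipped.sum(axis=0) + noise).reshape(per_example_grads.shape[1:])
+
+rng = np.random.RandomState(0)
+grads = rng.randn(32, 128, 10)   # a batch of 32 per-example gradients
+update = dp_weight_update(grads, clip_norm=1.0, sigma=1.1, rng=rng)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;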
+
+&lt;p&gt;For benchmarks on realistic workloads, please refer to the tech report &lt;a href=&quot;https://arxiv.org/abs/1807.06689&quot;&gt;&lt;em&gt;Efficient Deep Learning on Multi-Source Private Data&lt;/em&gt;&lt;/a&gt;.
+And, of course, feel free to give the framework a spin in the &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/sgx&quot;&gt;TVM SGX example&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h1 id=&quot;conclusion&quot;&gt;Conclusion&lt;/h1&gt;
+
+&lt;p&gt;The next generation of learning systems will be ushered in by privacy.
+As TEE technology becomes better understood and more widely available, it makes sense to leverage it as a resource for privacy-preserving machine learning and analytics.
+TVM is well poised to facilitate development of this use case in both research and deployment.&lt;/p&gt;
+
+&lt;h1 id=&quot;bio--acknowledgement&quot;&gt;Bio &amp;amp; Acknowledgement&lt;/h1&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://github.com/nhynes&quot;&gt;Nick&lt;/a&gt; is a PhD student in Prof. Dawn Song’s lab at UC Berkeley.
+His research interest is in the general domain of ML on shared private data, but this is really just an excuse to mess with Rust, security monitors, hardware enclaves, and compilers like TVM.&lt;/p&gt;
+
+&lt;p&gt;Thanks to Tianqi Chen for the code reviews!&lt;/p&gt;
+
+&lt;h1 id=&quot;references&quot;&gt;References&lt;/h1&gt;
+
+&lt;p&gt;[1] &lt;a href=&quot;https://arxiv.org/abs/1607.00133&quot;&gt;Deep Learning with Differential Privacy&lt;/a&gt;&lt;br /&gt;
+[2] &lt;a href=&quot;https://arxiv.org/pdf/1510.01799v2.pdf&quot;&gt;Efficient Per-Example Gradient Computations&lt;/a&gt;&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Automatic Kernel Optimization for Deep Learning on All Hardware Platforms</title>
+   <link href="https://tvm.apache.org/2018/10/03/auto-opt-all"/>
+   <updated>2018-10-03T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2018/10/03/auto-opt-all</id>
+   <content type="html">&lt;p&gt;Optimizing the performance of deep neural networks on a diverse range of hardware platforms is still a hard
+problem for AI developers. In terms of system support, we are facing a many-to-many problem here:
+deploying trained models from multiple frontends (e.g. Tensorflow, ONNX, MXNet) to multiple
+hardware platforms (e.g. CPU, GPU, accelerators). The most performance-critical part of
+this problem is obtaining high-performance kernel implementations for a growing set of model
+architectures and hardware platforms.&lt;/p&gt;
+
+&lt;p&gt;To address this challenge, TVM takes a full stack compiler approach.
+TVM combines code generation and automatic program optimization to generate kernels
+that are comparable to heavily hand-optimized libraries,
+obtaining state-of-the-art inference performance on hardware platforms including
+ARM CPUs, Intel CPUs, Mali GPUs, NVIDIA GPUs and AMD GPUs.&lt;/p&gt;
+
+&lt;p&gt;In this blog post, we show the workflow of automatic kernel optimization in the TVM compiler stack and
+benchmark results on several hardware platforms.&lt;/p&gt;
+
+&lt;h1 id=&quot;system-overview&quot;&gt;System Overview&lt;/h1&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/autotune-all/overview.png&quot; alt=&quot;image&quot; width=&quot;35%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 1. System Overview &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;Kernel optimization in TVM is done in an iterative loop fashion.
+As shown in Figure 1, the automatic kernel optimization takes a neural network (typically in computational graph representation)
+from frontend frameworks as input, and generates kernels for all operators in this network.&lt;/p&gt;
+
+&lt;p&gt;The inner loop uses a scalable RPC runtime, machine learning based tuners and a tensor compiler.
+In each round of the loop, the tuner picks a batch of promising candidate kernel implementations from a large search space,
+and profiles them on real hardware. The profiling results are then used as training
+data to fit a prediction model. After fitting the prediction model, the tuner picks the next promising candidates according to the predictions,
+and the loop continues. This way, we search for fast kernels iteratively.&lt;/p&gt;
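+
+&lt;p&gt;Concretely, this loop is driven by a tuner object. The sketch below shows roughly what it looks like
+with the AutoTVM API; the device key, ports and trial counts are placeholders, and tasks is the list of
+tuning tasks extracted from the network (as shown later in this post):&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# A rough sketch of driving the tuning loop described above.
+from tvm import autotvm
+
+measure_option = autotvm.measure_option(
+    builder=autotvm.LocalBuilder(),
+    # Profile candidate kernels on real boards registered with an RPC tracker
+    # under the (placeholder) device key 'rk3399'.
+    runner=autotvm.RPCRunner('rk3399', host='localhost', port=9190,
+                             number=4, timeout=10))
+
+for task in tasks:
+    tuner = autotvm.tuner.XGBTuner(task)   # machine learning based cost model
+    tuner.tune(n_trial=1000,
+               measure_option=measure_option,
+               callbacks=[autotvm.callback.log_to_file('tuning.log')])
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;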
+
+&lt;p&gt;The figure below compares traditional auto-tuning and AutoTVM.
+The major differences are that AutoTVM is&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;Scalable&lt;/strong&gt; to a heterogeneous cluster of devices&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;Learning&lt;/strong&gt; to optimize tensor programs with a transferable machine learning cost model&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;You can refer to our paper [1] for more details.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/autotune-all/autotvm.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 2. Comparison of Traditional Auto-tuning and AutoTVM &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;begin-tuning&quot;&gt;Begin Tuning&lt;/h2&gt;
+&lt;p&gt;For demonstration, we run our optimization for resnet-18 on RK3399, an ARM development board.
+The detailed instructions are omitted due to the space limit of a blog post.
+Links to tutorials for ARM CPU, Mali GPU, NVIDIA GPU, AMD GPU are all available at the end of this blog.&lt;/p&gt;
+
+&lt;p&gt;First we get a pre-trained model from MXNet model zoo, and extract tuning tasks from it.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;mxnet.gluon.model_zoo.vision&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_model&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;block&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_model&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'resnet18_v1'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pretrained&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/ [...]
+&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nnvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;frontend&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt [...]
+
+&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;autotvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;extract_from_graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;tune_tasks&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tuning_option&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;There are 12 different conv2d layers in resnet-18, so we launch 12 tuning tasks.
+For each of them, the tuner runs several hundred trials and picks the best one.
+After finishing all tuning tasks, we compile the whole network and generate a single deployable minimal library.
+One sample output is&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Extract tasks...
+Tuning...
+[Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
+[Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
+[Task  3/12]  Current/Best:    4.67/  24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
+[Task  4/12]  Current/Best:   11.35/  46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
+[Task  5/12]  Current/Best:    1.01/  19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
+[Task  6/12]  Current/Best:    2.47/  23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
+[Task  7/12]  Current/Best:   14.57/  33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
+[Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+[Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+[Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+[Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+[Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
+Compile...
+Upload...
+Evaluate inference time cost...
+Mean inference time (std dev): 162.59 ms (0.06 ms)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
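+
+&lt;p&gt;After tuning, the logged best configurations are applied when compiling the network. A sketch of
+that step follows (file names and the target string are placeholders; net and params are the ones
+obtained from the MXNet model above):&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# Apply the best schedules found during tuning, then compile and export.
+import nnvm.compiler
+from tvm import autotvm
+
+# net, params as obtained from nnvm.frontend.from_mxnet(...) above.
+target = 'llvm -device=arm_cpu -target=aarch64-linux-gnu'   # example RK3399 target
+with autotvm.apply_history_best('tuning.log'):
+    graph, lib, params = nnvm.compiler.build(
+        net, target=target, shape={'data': (1, 3, 224, 224)}, params=params)
+lib.export_library('net.tar')   # minimal deployable library
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;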
+
+&lt;p&gt;The tuning is especially helpful and worth a try if your model has some unusual shapes or
+your hardware is customized, since hand-optimized static libraries cannot cover every situation.&lt;/p&gt;
+
+&lt;h1 id=&quot;benchmark-results&quot;&gt;Benchmark Results&lt;/h1&gt;
+&lt;p&gt;We pre-tuned some popular networks on our device cluster and released the following benchmark.
+Instructions for reproduction are at the end of this blog.&lt;/p&gt;
+
+&lt;p&gt;Comprehensively benchmarking TVM is easy since we have a unified runtime interface.
+However, maintaining complete, up-to-date, and correct comparisons against all other platforms is not feasible
+without expert assistance from the developers of many other projects.
+So we put all our numbers in a table, and then provide an incomplete comparison with some other libraries.&lt;/p&gt;
+
+&lt;h2 id=&quot;comparison&quot;&gt;Comparison&lt;/h2&gt;
+&lt;p&gt;We validate the effectiveness of our automatic optimization stack by 
+comparing with heavily optimized traditional libraries on each platform.&lt;/p&gt;
+
+&lt;p&gt;We tested popular image classification networks on the ImageNet dataset (3x224x224 input) with batch size = 1 and data type = float32.
+The reported numbers are time costs per image in milliseconds.&lt;/p&gt;
+
+&lt;h3 id=&quot;arm-cpu&quot;&gt;ARM CPU&lt;/h3&gt;
+
+&lt;p&gt;We choose &lt;a href=&quot;https://github.com/Tencent/ncnn&quot;&gt;NCNN&lt;/a&gt;, a widely used, hand-optimized kernel library, as the baseline.
+It makes extensive use of NEON assembly instructions. For example, the code base contains
+&lt;a href=&quot;https://github.com/Tencent/ncnn/blob/master/src/layer/arm/convolution_3x3.h&quot;&gt;13k lines of code&lt;/a&gt; for only 3x3 convolution layers.
+We reference the benchmark numbers in their project repository.
+As shown in the figure below, TVM outperforms it for all networks on Raspberry Pi 3B.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/arm.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;mali-gpu&quot;&gt;Mali GPU&lt;/h3&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://github.com/ARM-software/ComputeLibrary&quot;&gt;ARM Compute Library&lt;/a&gt; is a vendor-provided library that supports Mali GPU (OpenCL) well.
+According to the results, TVM provides stronger performance in ResNet and MobileNet due to advantages in convolutional layers.
+TVM lags behind a bit on vgg-16 because vgg-16 is an old and huge network and has several large dense layers.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/mali.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;nvidia-gpu&quot;&gt;NVIDIA GPU&lt;/h3&gt;
+
+&lt;p&gt;On NVIDIA GPU, &lt;a href=&quot;https://developer.nvidia.com/cudnn&quot;&gt;CuDNN&lt;/a&gt; and &lt;a href=&quot;https://developer.nvidia.com/tensorrt&quot;&gt;TensorRT&lt;/a&gt; are two vendor-provided libraries for training and inference respectively. Since we focus on inference,
+we run our benchmark in the unbatched setting. Another tensor compiler, &lt;a href=&quot;https://github.com/plaidml/plaidml&quot;&gt;PlaidML&lt;/a&gt;, is also reported as a baseline,
+as there is a previous benchmark comparing it against a pre-AutoTVM version of TVM.
+We reference its benchmark results from &lt;a href=&quot;https://github.com/plaidml/plaidbench&quot;&gt;PlaidBench&lt;/a&gt;.
+According to the results below, TVM achieves parity with TensorRT performance.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/nvidia.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;amd-gpu&quot;&gt;AMD GPU&lt;/h3&gt;
+
+&lt;p&gt;We also take a quick look at an AMD GPU. TVM supports OpenCL and &lt;a href=&quot;https://rocm.github.io/&quot;&gt;ROCm&lt;/a&gt; backends. We found ROCm is better since
+it is more specialized for AMD GPUs.
+&lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/MIOpen&quot;&gt;MIOpen&lt;/a&gt; is a vendor-provided
+kernel library. TVM’s graph runtime can call MIOpen’s kernel implementations directly, so we report
+the baseline performance by using this integration.&lt;/p&gt;
+
+&lt;p&gt;We didn’t do any specific optimization for AMD GPU. All computation definition and schedule code for NVIDIA GPU is directly reused.
+As a result, TVM is a little bit slower than MIOpen in most cases.
+We believe there is still room for improvement.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/amd.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;all-our-results&quot;&gt;All Our Results&lt;/h2&gt;
+&lt;p&gt;We tested the following networks on the ImageNet dataset (3x224x224 input) with batch size = 1 and data type = float32.
+The reported numbers are time costs per image in milliseconds.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;densenet121&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;inception v3&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;mobilenet&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;mobilenet v2&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;resnet18&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;resnet50&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;squeezenet v1.0&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;squeezenet v1.1&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;vgg16&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;vgg19&lt;/strong&gt;&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;ARM CPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Huawei P20 Pro&lt;/td&gt;
+      &lt;td&gt;181.4&lt;/td&gt;
+      &lt;td&gt;439.9&lt;/td&gt;
+      &lt;td&gt;41.1&lt;/td&gt;
+      &lt;td&gt;34.5&lt;/td&gt;
+      &lt;td&gt;76.5&lt;/td&gt;
+      &lt;td&gt;208.2&lt;/td&gt;
+      &lt;td&gt;51.8&lt;/td&gt;
+      &lt;td&gt;25.7&lt;/td&gt;
+      &lt;td&gt;480.6&lt;/td&gt;
+      &lt;td&gt;627.0&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Google Pixel2&lt;/td&gt;
+      &lt;td&gt;162.2&lt;/td&gt;
+      &lt;td&gt;433.5&lt;/td&gt;
+      &lt;td&gt;39.5&lt;/td&gt;
+      &lt;td&gt;30.1&lt;/td&gt;
+      &lt;td&gt;61.1&lt;/td&gt;
+      &lt;td&gt;181.3&lt;/td&gt;
+      &lt;td&gt;47.3&lt;/td&gt;
+      &lt;td&gt;23.2&lt;/td&gt;
+      &lt;td&gt;391.1&lt;/td&gt;
+      &lt;td&gt;487.7&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Firefly RK3399&lt;/td&gt;
+      &lt;td&gt;335.9&lt;/td&gt;
+      &lt;td&gt;1285.9&lt;/td&gt;
+      &lt;td&gt;78.6&lt;/td&gt;
+      &lt;td&gt;66.7&lt;/td&gt;
+      &lt;td&gt;161.2&lt;/td&gt;
+      &lt;td&gt;403.8&lt;/td&gt;
+      &lt;td&gt;94.6&lt;/td&gt;
+      &lt;td&gt;48.5&lt;/td&gt;
+      &lt;td&gt;902.9&lt;/td&gt;
+      &lt;td&gt;1090.1&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Raspberry Pi 3B&lt;/td&gt;
+      &lt;td&gt;609.5&lt;/td&gt;
+      &lt;td&gt;2070.4&lt;/td&gt;
+      &lt;td&gt;122.2&lt;/td&gt;
+      &lt;td&gt;103.7&lt;/td&gt;
+      &lt;td&gt;322.5&lt;/td&gt;
+      &lt;td&gt;725.8&lt;/td&gt;
+      &lt;td&gt;185.1&lt;/td&gt;
+      &lt;td&gt;94.1&lt;/td&gt;
+      &lt;td&gt;1759.6&lt;/td&gt;
+      &lt;td&gt;2118.6&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Xilinx PYNQ&lt;/td&gt;
+      &lt;td&gt;2888.3&lt;/td&gt;
+      &lt;td&gt;9709.1&lt;/td&gt;
+      &lt;td&gt;723.5&lt;/td&gt;
+      &lt;td&gt;514.3&lt;/td&gt;
+      &lt;td&gt;1234.6&lt;/td&gt;
+      &lt;td&gt;3580.5&lt;/td&gt;
+      &lt;td&gt;909.9&lt;/td&gt;
+      &lt;td&gt;477.3&lt;/td&gt;
+      &lt;td&gt;&lt;sup&gt;-(Note 1)&lt;/sup&gt;&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;Mali GPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Mali-T860&lt;/td&gt;
+      &lt;td&gt;410.9&lt;/td&gt;
+      &lt;td&gt;783.1&lt;/td&gt;
+      &lt;td&gt;75.4&lt;/td&gt;
+      &lt;td&gt;70.8&lt;/td&gt;
+      &lt;td&gt;128.6&lt;/td&gt;
+      &lt;td&gt;352.9&lt;/td&gt;
+      &lt;td&gt;106.2&lt;/td&gt;
+      &lt;td&gt;58.0&lt;/td&gt;
+      &lt;td&gt;679.5&lt;/td&gt;
+      &lt;td&gt;805.3&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;NVIDIA GPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GTX 1080 Ti&lt;/td&gt;
+      &lt;td&gt;3.6&lt;/td&gt;
+      &lt;td&gt;5.8&lt;/td&gt;
+      &lt;td&gt;0.6&lt;/td&gt;
+      &lt;td&gt;- &lt;sup&gt;(Note 2) &lt;/sup&gt;&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;2.7&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;4.0&lt;/td&gt;
+      &lt;td&gt;4.6&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GTX TITAN X&lt;/td&gt;
+      &lt;td&gt;5.8&lt;/td&gt;
+      &lt;td&gt;9.7&lt;/td&gt;
+      &lt;td&gt;1.0&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;4.3&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;6.4&lt;/td&gt;
+      &lt;td&gt;7.5&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;AMD GPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;AMD Vega FE&lt;/td&gt;
+      &lt;td&gt;5.7&lt;/td&gt;
+      &lt;td&gt;8.8&lt;/td&gt;
+      &lt;td&gt;1.0&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;4.5&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;5.9&lt;/td&gt;
+      &lt;td&gt;7.0&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Note 1: Out of memory on this board.&lt;/li&gt;
+  &lt;li&gt;Note 2: We didn’t tune some small networks on GPU due to time constraints.
+When profiling data is not available, TVM can use fallback code generation. 
+But competitive performance is not guaranteed in this scenario.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;conclusion&quot;&gt;Conclusion&lt;/h1&gt;
+&lt;p&gt;With an expressive code generator and an efficient search algorithm, we are able to
+generate kernels that are comparable to heavily hand-optimized ones.
+Since programmer time is expensive and machine time is getting cheaper,
+we believe automatic optimization with real hardware and data in the loop will be the standard workflow
+for inference deployment. TVM provides exactly such a solution.&lt;/p&gt;
+
+&lt;h2 id=&quot;links&quot;&gt;Links&lt;/h2&gt;
+&lt;p&gt;[1] benchmark: &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/benchmark&quot;&gt;https://github.com/dmlc/tvm/tree/master/apps/benchmark&lt;/a&gt;&lt;br /&gt;
+[2] Tutorial on tuning for ARM CPU: &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html&quot;&gt;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html&lt;/a&gt;&lt;br /&gt;
+[3] Tutorial on tuning for Mobile GPU: &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_mobile_gpu.html&quot;&gt;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_mobile_gpu.html&lt;/a&gt;&lt;br /&gt;
+[4] Tutorial on tuning for NVIDIA/AMD GPU: &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_cuda.html&quot;&gt;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_cuda.html&lt;/a&gt;&lt;br /&gt;
+[5] Paper about AutoTVM: &lt;a href=&quot;https://arxiv.org/abs/1805.08166&quot;&gt;Learning to Optimize Tensor Programs&lt;/a&gt;&lt;br /&gt;
+[6] Paper about Intel CPU (by AWS contributors) :  &lt;a href=&quot;https://arxiv.org/abs/1809.02697&quot;&gt;Optimizing CNN Model Inference on CPUs&lt;/a&gt;&lt;/p&gt;
+
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Building a Cross-Framework Deep Learning Compiler via DLPack</title>
+   <link href="https://tvm.apache.org/2018/08/10/DLPack-Bridge"/>
+   <updated>2018-08-10T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2018/08/10/DLPack-Bridge</id>
+   <content type="html">&lt;p&gt;Deep learning frameworks such as TensorFlow, PyTorch, and Apache MxNet provide a
+powerful toolbox for quickly prototyping and deploying deep learning models.
+Unfortunately, their ease-of-use has often come at the cost of fragmentation: it
+is only easy to use each framework in isolation. Vertical integration has made
+development streamlined for common use cases, but venturing off of the beaten
+path can be tricky.&lt;/p&gt;
+
+&lt;p&gt;One scenario that is poorly supported is passing tensors
+&lt;em&gt;directly&lt;/em&gt; from one framework to another in memory, without any data duplication
+or copies. Supporting such a use case would enable users to efficiently string
+together pipelines where certain operators are better supported (or faster) in
+one framework than in another. A shared data representation between
+frameworks would also bridge this gap, and allow compiler stacks to target a
+single format when generating code for operators.&lt;/p&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://github.com/dmlc/dlpack&quot;&gt;DLPack&lt;/a&gt; is an intermediate in-memory
+representation standard for tensor data structures. With DLPack as a common
+representation, we can leverage TVM in scripts written for frameworks that
+traditionally could only rely on vendor-provided libraries. TVM packed functions
+can operate on DLPack tensors, providing wrappers bridging tensor data
+structures from frameworks such as PyTorch and MxNet &lt;em&gt;with zero-data-copy&lt;/em&gt;.&lt;/p&gt;
+
+&lt;p&gt;DLPack presents a simple, portable in-memory data structure:&lt;/p&gt;
+&lt;div class=&quot;language-c highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;typedef&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;struct&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*!
+   * \brief The opaque data pointer points to the allocated data.
+   *  This will be CUDA device pointer or cl_mem handle in OpenCL.
+   *  This pointer is always aligned to 256 bytes as in CUDA.
+   */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The device context of the tensor */&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;DLContext&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief Number of dimensions */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ndim&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The data type of the pointer*/&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;DLDataType&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The shape of the tensor */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;int64_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*!
+   * \brief strides of the tensor,
+   *  can be NULL, indicating tensor is compact.
+   */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;int64_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;strides&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The offset in bytes to the beginning pointer to data */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;uint64_t&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;byte_offset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DLTensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As an example, we declare and compile a matrix multiplication operator in TVM,
+and build a wrapper that uses the DLPack representation to allow this operator
+to support PyTorch tensors. We also repeat this demonstration with MxNet. This
+extension allows machine learning developers to quickly port research code to
+relatively unsupported hardware platforms without sacrificing performance.&lt;/p&gt;
+
+&lt;p&gt;Illustration of how DLPack provides an intermediate wrapper that is shared
+between frameworks and TVM:&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/pytorch-dlpack/dlpack.png&quot; alt=&quot;image&quot; width=&quot;65%&quot; /&gt;&lt;br /&gt;
+Figure 1&lt;/p&gt;
+
+&lt;p&gt;First, we compute a reference output in PyTorch:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We then define and build a TVM matrix multiplication operator, using the default
+schedule:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;convert&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;& [...]
+    &lt;span class=&quot;n&quot;&gt;X&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;Y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+
+    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class= [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&q [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;For brevity, we do not cover TVM’s large collection of scheduling primitives
+that we can use to optimize matrix multiplication. If you wish to make a custom
+GEMM operator run &lt;em&gt;fast&lt;/em&gt; on your hardware device, a detailed tutorial can be
+found &lt;a href=&quot;https://docs.tvm.ai/tutorials/optimize/opt_gemm.html&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;We then convert the TVM function into one that supports PyTorch tensors:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib.dlpack&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_pytorch_func&lt;/span&gt;
+    &lt;span class=&quot;c1&quot;&gt;# fmm is the previously built TVM function (Python function)
+&lt;/span&gt;    &lt;span class=&quot;c1&quot;&gt;# fmm_pytorch is the wrapped TVM function (Python function)
+&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;fmm_pytorch&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_pytorch_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmm_pytorch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;  [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;and verify that the results match.&lt;/p&gt;
+
+&lt;p&gt;We can repeat the same example, but using MXNet instead:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;mxnet&lt;/span&gt;
+    &lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib.mxnet&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_mxnet_func&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cpu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span clas [...]
+    &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;f_mxnet&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_mxnet_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;f_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;under-the-hood-of-the-pytorch-example&quot;&gt;Under the hood of the PyTorch Example&lt;/h2&gt;
+&lt;p&gt;As TVM provides &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L455&quot;&gt;functions&lt;/a&gt; to convert dlpack tensors to tvm &lt;code class=&quot;highlighter-rouge&quot;&gt;NDArray&lt;/code&gt;s and
+vice versa, all that is needed is some syntactic sugar wrapping the functions.
+&lt;code class=&quot;highlighter-rouge&quot;&gt;convert_func&lt;/code&gt; is a generic converter for frameworks using tensors with dlpack
+support, and can be used to implement convenient converters, such as
+&lt;code class=&quot;highlighter-rouge&quot;&gt;to_pytorch_func&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_type&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+    &lt;span class=&quot;k&quot;&gt;assert&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;callable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;_wrapper&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ndarray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_dlpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;to_dlpack_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+            &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_type&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;f [...]
+        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_wrapper&lt;/span&gt;
+
+&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;to_pytorch_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch&lt;/span&gt;
+    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch.utils.dlpack&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&l [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>VTA: An Open, Customizable Deep Learning Acceleration Stack </title>
+   <link href="https://tvm.apache.org/2018/07/12/vta-release-announcement"/>
+   <updated>2018-07-12T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2018/07/12/vta-release-announcement</id>
+   <content type="html">&lt;p style=&quot;text-align: center&quot;&gt;Thierry Moreau(VTA architect), Tianqi Chen(TVM stack), Ziheng Jiang†(graph compilation), Luis Vega(cloud deployment)&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Advisors: Luis Ceze, Carlos Guestrin, Arvind Krishnamurthy&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington&lt;/p&gt;
+
+&lt;p&gt;Hardware acceleration is an enabler for ubiquitous and efficient deep learning. With hardware accelerators appearing in the datacenter and edge devices, hardware specialization has taken on a prominent role in the deep learning system stack.&lt;/p&gt;
+
+&lt;p&gt;We are excited to announce the launch of the Versatile Tensor Accelerator (VTA, pronounced &lt;em&gt;vita&lt;/em&gt;), an open, generic, and customizable deep learning accelerator design. VTA is a programmable accelerator that exposes a RISC-like programming abstraction to describe tensor-level operations. We designed VTA to expose the most salient and common characteristics of mainstream deep learning accelerators, such as tensor operations, DMA load/stores, and explicit comput [...]
+
+&lt;p&gt;VTA is more than a standalone accelerator design: it’s an end-to-end solution that includes drivers, a JIT runtime, and an optimizing compiler stack based on TVM. The current release includes a behavioral hardware simulator, as well as the infrastructure to deploy VTA on low-cost FPGA hardware for fast prototyping. By extending the TVM stack with a customizable, and open source deep learning hardware accelerator design, we are exposing a transparent end-to-end deep learning stac [...]
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_stack.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The VTA and TVM stack together constitute a blueprint for an end-to-end, accelerator-centric deep learning system that can:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Provide an open deep learning system stack for hardware, compilers, and systems researchers alike to incorporate optimizations and co-design techniques.&lt;/li&gt;
+  &lt;li&gt;Lower the barrier of entry for machine learning practitioners to experiment with novel network architectures, operators and data representations that require specialized hardware support.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;use-case-scenarios-for-researchers&quot;&gt;Use-Case Scenarios for Researchers&lt;/h2&gt;
+
+&lt;p&gt;We highlight ways in which the VTA design together with a complete TVM software stack can enable novel opportunities across hardware, compilers, and deep learning research.&lt;/p&gt;
+
+&lt;h3 id=&quot;hardware-designers-and-computer-architects&quot;&gt;Hardware Designers and Computer Architects&lt;/h3&gt;
+
+&lt;p&gt;With new ASIC designs being regularly announced, providing a complete and usable software stack on top of novel hardware is essential to gain a competitive edge both in research circles, and commercially.
+Our VTA release provides a reference TVM software stack built for hardware accelerators.
+We hope to empower hardware designers to quickly build and deploy optimized deep learning libraries ready to be utilized by high-level frameworks of the likes of TensorFlow or PyTorch.
+Software support is essential for performing full-system evaluation to understand the limits and performance bottlenecks in hardware-accelerated systems.
+With the use of FPGAs as hardware deployment backends, we provide a complete solution for rapid and iterative hardware design prototyping.
+Finally, our vision is to see VTA grow into a collection of hardware designs, eventually leading to an open ecosystem of custom hardware accelerators.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;https://www.acm.org/binaries/content/gallery/acm/ctas/publications/artifact-badges.jpg/artifact-badges.jpg/acm%3Adesktopcta&quot; alt=&quot;image&quot; width=&quot;20%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;In addition, VTA is one of the first hardware-software reproducible &lt;a href=&quot;http://ctuning.org/ae/&quot;&gt;ACM artifacts&lt;/a&gt;, which can serve as a template for reproducible deep learning architecture research.
+The VTA artifact, deployable using &lt;a href=&quot;http://cknowledge.org/&quot;&gt;CK&lt;/a&gt;, was presented at ReQuEST 2018, co-located with &lt;a href=&quot;http://cknowledge.org/request-cfp-asplos2018.html&quot;&gt;ASPLOS&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h3 id=&quot;optimizing-compilers-researchers&quot;&gt;Optimizing Compilers Researchers&lt;/h3&gt;
+
+&lt;p&gt;Novel intermediate representations and optimizing compilers of the likes of TVM have been proposed to better take advantage of deep learning workload characteristics.
+VTA complements TVM to provide accelerator-centric optimization passes, and low-level code generation. Our open-source deep learning compiler stack also aims to emulate the success of LLVM, by allowing the community to improve accelerator-centric compiler support over time, particularly as more hardware variants of VTA emerge.
+The extensibility of the compiler stack, combined with the ability to modify the architecture and the programming interface of the hardware back-end, should lead to exciting opportunities in hardware-software co-design for deep learning.&lt;/p&gt;
+
+&lt;h3 id=&quot;deep-learning-researchers&quot;&gt;Deep Learning Researchers&lt;/h3&gt;
+
+&lt;p&gt;A transparent and customizable software and hardware stack empowers deep learning researchers to come up with novel neural network operators, and data representations, all the while enabling the complete evaluation of those optimizations on end-to-end systems. Techniques like binarization are currently limited to CPU and GPU evaluations, unless significant engineering resources are dedicated to produce an FPGA or ASIC design that can evaluate the technique’s full energy savings  [...]
+
+&lt;h2 id=&quot;technical-details&quot;&gt;Technical Details&lt;/h2&gt;
+
+&lt;h3 id=&quot;stack-overview&quot;&gt;Stack Overview&lt;/h3&gt;
+
+&lt;p&gt;The VTA deep learning accelerator and TVM stack can bridge the gap between productivity-oriented deep learning frameworks, and performance-focused hardware substrates, such as FPGAs.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;NNVM, the graph-level optimizer, provides a graph-level Intermediate Representation (IR) used as a common language between different deep learning frameworks to take advantage of graph-level optimizations, such as operator fusion. The NNVM IR is also used to specify data layout and data format constraints: e.g. tiling for tensorization, and bit-packing for ultra-low precision computing.&lt;/li&gt;
+  &lt;li&gt;TVM, the tensor-level optimizer, builds upon the Halide DSL and schedule primitives to provide an optimizing compiler capable of bringing performance portability for deep learning across hardware back-ends. TVM brings novel scheduling primitives that target specialized hardware accelerators, such as tensorization, which lowers computation onto specialized tensor-tensor hardware instructions. In addition, it provides schedule primitives and lowering rules that allow for explic [...]
+  &lt;li&gt;The VTA runtime performs JIT compilation of VTA binaries (instruction streams and micro-kernel code), manages shared memory, and performs synchronization to hand off execution to VTA. The VTA runtime presents an API that looks generic to TVM, to hide complexities of platform-specific bookkeeping tasks. It exposes a C++ API that a TVM module can call into - this simplifies the future inclusion of other hardware accelerator designs, without having to drastically modify the uppe [...]
+  &lt;li&gt;VTA’s two-level ISA provides both (1) a high-level CISC ISA that describes variable-latency operations such as DMA loads or deep learning operators, and (2) a low-level, fixed-latency RISC ISA that describes low-level matrix-matrix operations. This two-level ISA allows both code compactness and expressiveness.&lt;/li&gt;
+  &lt;li&gt;Finally, VTA’s micro-architecture provides a flexible deep learning hardware design specification that can be conveniently compiled onto other FPGA platforms, and eventually, in the long term, down to ASICs.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h3 id=&quot;vta-hardware-design-overview&quot;&gt;VTA Hardware Design Overview&lt;/h3&gt;
+
+&lt;p&gt;The Versatile Tensor Accelerator (VTA) is a generic deep learning accelerator built around a GEMM core, which performs dense matrix multiplication at a high computational throughput.
+The design is inspired by mainstream deep learning accelerators, of the likes of Google’s TPU accelerator. The design adopts decoupled access-execute to hide memory access latency and maximize utilization of compute resources. To a broader extent, VTA can serve as a template deep learning accelerator design, exposing a clean tensor computation abstraction to the compiler stack.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_overview.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The figure above presents a high-level overview of the VTA hardware organization. VTA is composed of four modules that communicate with each other via FIFO queues and single-writer/single-reader SRAM memory blocks, to allow for task-level pipeline parallelism.
+The compute module performs both dense linear algebra computation with its GEMM core, and general computation with its tensor ALU.
+It operates on a register file that, instead of storing scalar values, stores tensors of rank 1 or 2.
+The micro-op cache stores low-level code that dictates a sequence of operations to mutate the register file.&lt;/p&gt;
+
+&lt;p&gt;The VTA hardware design template offers modularity to the user, with the option to modify hardware datatypes, memory architecture, the GEMM core dimensions, hardware operators, and pipelining stages.
+Exposing multiple variants of VTA to the compiler stack facilitates the development of compilers, since we can test TVM’s ability to target a multitude of hardware accelerators, rather than a single design.&lt;/p&gt;
+
+&lt;h3 id=&quot;vta-prototyping-with-vta-simulator-and-pynq-fpga-board&quot;&gt;VTA Prototyping with VTA Simulator and Pynq FPGA Board&lt;/h3&gt;
+
+&lt;p&gt;The VTA release allows users to experiment with hardware acceleration, and accelerator-centric compiler optimizations in two ways.
+The first approach, which doesn’t require special hardware, is to run deep learning workloads on a behavioral simulator of the VTA design.
+This simulator back-end is readily available for developers to experiment with.
+The second approach relies on an off-the-shelf and low-cost FPGA development board – the &lt;a href=&quot;http://www.pynq.io/&quot;&gt;Pynq board&lt;/a&gt;, which exposes a reconfigurable FPGA fabric and an ARM SoC.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_system.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The VTA release offers a simple compilation and deployment flow of the VTA hardware design and TVM workloads on the Pynq platform, with the help of an RPC server interface.
+The RPC server handles FPGA reconfiguration tasks and TVM module invocation offloading onto the VTA runtime.
+The VTA runtime system runs on the ARM CPU of the Pynq embedded system, and generates VTA binaries on the fly to offload to the FPGA hardware.
+This complete solution allows for out-of-the-box prototyping on low-cost FPGAs, with an interactive and familiar Python environment, hiding much of the complexity and headaches of FPGA design away from the user.&lt;/p&gt;
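+
+&lt;p&gt;To give a rough idea of what this flow looks like from the Python side, the sketch below connects to the RPC server on the board and loads a previously cross-compiled module; the host name, port and file name are placeholders, and this assumes the RPC client lives under &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.rpc&lt;/code&gt; in your TVM version. The VTA-specific helpers are covered in the tutorials.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;from tvm import rpc
+
+# placeholder host/port for the Pynq board running the TVM RPC server
+remote = rpc.connect('pynq-board.local', 9091)
+
+# upload a cross-compiled module (hypothetical file name) and load it on the board
+remote.upload('resnet_vta.tar')
+mod = remote.load_module('resnet_vta.tar')
+ctx = remote.cpu(0)  # the VTA runtime itself runs on the board's ARM CPU
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;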
+
+&lt;p&gt;For programmers familiar with hardware and FPGAs, we expose the VTA design expressed in HLS C, and provide scripts built on top of the Xilinx toolchains to compile the design into an FPGA bitstream.
+We are currently building a repository of VTA variants, so that users can explore different design variants for their deep learning workloads without having to go through the time-consuming FPGA compilation process.&lt;/p&gt;
+
+&lt;h2 id=&quot;performance-assessment&quot;&gt;Performance Assessment&lt;/h2&gt;
+
+&lt;p&gt;&lt;em&gt;VTA is at its early stages of development and we expect more performance improvements and optimizations to come.
+As of now we offer end-to-end performance evaluations on the low-cost Pynq board which incorporates a dated 28nm FPGA fabric.
+While this platform is meant for prototyping (the 2012 FPGA cannot compete with modern ASICs), we are porting VTA to newer high-performance FPGA platforms that will offer more competitive performance.&lt;/em&gt;&lt;/p&gt;
+
+&lt;p&gt;&lt;em&gt;We are working on more experiments and will release new results as they are obtained.&lt;/em&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;resource-utilization-on-resnet-18&quot;&gt;Resource Utilization on ResNet-18&lt;/h3&gt;
+
+&lt;p&gt;A popular method used to assess the efficient use of hardware is the roofline diagram: given a hardware design, how efficiently are different workloads utilizing the hardware compute and memory resources? The roofline plot below shows the throughput achieved on different convolution layers of the ResNet-18 inference benchmark. Each layer has a different arithmetic intensity, i.e. compute to data movement ratio.
+In the left half, convolution layers are bandwidth limited, whereas on the right half, they are compute limited.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_roofline.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The goal behind designing a hardware architecture, and a compiler stack is to bring each workload as close as possible to the roofline of the target hardware.
+The roofline plot shows the effects of having the hardware and compiler work together to maximize utilization of the available hardware resources.
+The technique showcased is latency hiding, which requires explicit dependence tracking at the hardware level, compiler support to partition work, and explicit dependence insertion in the instruction stream during code generation.
+The result is an overall higher utilization of the available compute and memory resources.&lt;/p&gt;
+
+&lt;h3 id=&quot;end-to-end-resnet-18-evaluation&quot;&gt;End to end ResNet-18 evaluation&lt;/h3&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_e2e.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;A benefit of having a complete compiler stack built for VTA is the ability to run end-to-end workloads. This is compelling in the context of hardware acceleration because we need to understand what performance bottlenecks and Amdahl limitations stand in the way of obtaining faster performance.
+The bar plot above shows inference performance with and without offloading the ResNet convolutional layers to the FPGA-based VTA design, on the Pynq board’s ARM Cortex A9 SoC.
+At a glance, it is clear that VTA is accomplishing its goal of reducing the time it takes to perform convolutions on the CPU (dark blue).
+However, it becomes apparent that other operators need offloading, as they now constitute a new bottleneck.
+This kind of high-level visibility is essential to system designers who want to understand how systems affect end-to-end performance.&lt;/p&gt;
+
+&lt;h2 id=&quot;open-source-effort&quot;&gt;Open Source Effort&lt;/h2&gt;
+&lt;p&gt;VTA is a research effort at the Paul G. Allen School of Computer Science and Engineering at the University of Washington, and is now integrated into the TVM stack. The TVM project follows the Apache open-source model to create a community-maintained project. You are more than welcome to join us and lead the effort.&lt;/p&gt;
+
+&lt;h2 id=&quot;acknowledgements&quot;&gt;Acknowledgements&lt;/h2&gt;
+&lt;p&gt;VTA is a research project that came out of the SAML group, which is generously supported by grants from DARPA and the National Science Foundation and gifts from Huawei, Oracle, Intel and anonymous donors.&lt;/p&gt;
+
+&lt;h2 id=&quot;get-started&quot;&gt;Get Started!&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;TVM and VTA Github page can be found here: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;.&lt;/li&gt;
+  &lt;li&gt;You can get started with easy to follow &lt;a href=&quot;https://docs.tvm.ai/vta/tutorials/index.html&quot;&gt;tutorials on programming VTA with TVM&lt;/a&gt;.&lt;/li&gt;
+  &lt;li&gt;For more technical details on VTA, read our &lt;a href=&quot;https://arxiv.org/abs/1807.04188&quot;&gt;VTA technical report&lt;/a&gt; on ArXiv.&lt;/li&gt;
+&lt;/ul&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Bringing TVM into TensorFlow for Optimizing Neural Machine Translation on GPU</title>
+   <link href="https://tvm.apache.org/2018/03/23/nmt-transformer-optimize"/>
+   <updated>2018-03-23T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2018/03/23/nmt-transformer-optimize</id>
+   <content type="html">&lt;h2 id=&quot;author&quot;&gt;Author&lt;/h2&gt;
+
+&lt;p&gt;This is a guest blog post contributed by Alibaba Group’s Machine Translation Platform team and PAI-Blade team.&lt;/p&gt;
+
+&lt;h2 id=&quot;background&quot;&gt;Background&lt;/h2&gt;
+
+&lt;p&gt;Neural Machine Translation (NMT) is an end-to-end approach for automating translation, with the potential to overcome the weaknesses in conventional phrase-based translation systems. Recently, Alibaba Group has been working on deploying an NMT service for global e-commerce.&lt;/p&gt;
+
+&lt;p&gt;Currently we are exploiting Transformer [1] as the major backbone in our NMT system since it is more friendly for efficient offline training, with on-par (or even higher) precision against classical RNN/LSTM-based models. Although Transformer is friendly for the offline training phase as it breaks the dependencies across time steps, it is not very efficient for online inference. In our production environment, it has been found that the inference speed of the initial version of Trans [...]
+One particular challenge we observed is that batch matmul is a major performance hot-spot in Transformer, and the current implementation in cuBLAS is not well optimized.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nmt-transformer/model_arch.png&quot; alt=&quot;image&quot; width=&quot;40%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The results below show that the TVM-generated kernel (with schedule optimization) brings at least &lt;b&gt;&lt;em&gt;13X&lt;/em&gt;&lt;/b&gt; speed-up for batch matmul computation, and a further speed-up with operator fusion enabled.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nmt-transformer/batch-matmul-bar-charts.png&quot; alt=&quot;image&quot; width=&quot;45%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;batch-matmul&quot;&gt;Batch Matmul&lt;/h2&gt;
+
+&lt;h3 id=&quot;why-batch-matmul&quot;&gt;Why batch matmul&lt;/h3&gt;
+&lt;p&gt;In Transformer, batch matmul is widely used in the computation of multi-head attention. Using batch matmul, multiple heads in the attention layer can run in parallel, which can help improve the computation efficiency of the hardware.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nmt-transformer/batchmatmul.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We conducted a thorough profiling of the Transformer model in the inference phase, and it is shown that batch matmul computation contributes up to ~30% of GPU kernel execution time. Using nvprof[2] to do some first-principle analysis of cuBLAS’s batch matmul kernel, it is clearly indicated that the current implementation is quite under-performing, and several interesting phenomena are observed.&lt;/p&gt;
+
+&lt;h3 id=&quot;what-is-batch-matmul&quot;&gt;What is batch matmul&lt;/h3&gt;
+&lt;p&gt;Typically, a batch matmul computation performs the matrix-matrix multiplication over a batch of matrices. The batch is considered to be “uniform”, i.e. all instances have the same dimensions (M, N, K), leading dimensions (lda, ldb, ldc) and transpositions for their respective A, B and C matrices.&lt;/p&gt;
+
+&lt;p&gt;Batch matmul computation can be described more concretely as follows:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;void BatchedGemm(input A, input B, output C, M, N, K, batch_dimension) {
+  for (int i = 0; i &amp;lt; batch_dimension; ++i)  {
+    DoGemm(A[i],B[i],C[i],M,K,N)
+  }
+}
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h4 id=&quot;batch-matmul-shapes&quot;&gt;Batch matmul shapes&lt;/h4&gt;
+
+&lt;p&gt;In language translation tasks, the shape of the batch matmul is significantly smaller than that of normal matmul computations in other workloads. The shape in Transformer is relevant to the length of input sentences and that of decoder steps. Normally, it is smaller than 30.&lt;/p&gt;
+
+&lt;p&gt;As to the batch dimension, it is a fixed number given a certain inference batch size. For instance, if 16 is used as the batch size with a beam size of 4, the batch dimension is 16 * 4 * #head (number of heads in multi-head attention, which is usually 8). The matrix dimensions M, K, N are within the range of [1, max decode length] or [1, max encode length].&lt;/p&gt;
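+
+&lt;p&gt;As a concrete instance of the arithmetic above, a hypothetical configuration with batch size 16, beam size 4 and 8 attention heads gives the batch dimension used in the shapes below:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;batch_size, beam_size, num_heads = 16, 4, 8
+batch_dimension = batch_size * beam_size * num_heads  # 16 * 4 * 8 = 512
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;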
+
+&lt;h3 id=&quot;performance-issue-of-cublas-batch-matmul&quot;&gt;Performance issue of cuBLAS’ batch matmul&lt;/h3&gt;
+
+&lt;p&gt;Firstly, we make a theoretical FLOPs analysis over the batch matmul kernels. The results are quite interesting: all the batch matmuls have limited computational intensity (less than 1 TFLOP).&lt;/p&gt;
+
+&lt;p&gt;Then we profile the cuBLAS performance of batch matmul with multiple shapes through nvprof. The table below shows some of the metrics obtained on an NVIDIA M40 GPU with CUDA 8.0.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;input shape &lt;br /&gt; [batch, M, N, K]&lt;/th&gt;
+      &lt;th&gt;kernel&lt;/th&gt;
+      &lt;th&gt;theoretical FLOPs&lt;/th&gt;
+      &lt;th&gt;nvprof observed FLOPs&lt;/th&gt;
+      &lt;th&gt;theoretical FLOPs / &lt;br /&gt; observed FLOPs&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 17, 17, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;18939904&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;0.87%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 1, 17, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;1114112&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;0.052%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 17, 1, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;1114112&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;0.052%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 30, 30, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;58982400&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;2.74%&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Even with different shapes (varying in M, N, K), all the &lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt; calls execute the same amount of FLOPs, which is much bigger than the theoretical value. It can be inferred that all these different shapes may be padded to a certain shape. Among all these shapes, even in the best case, the theoretical FLOPs is still only 2.74% of the actually executed FLOPs, &lt;em&gt;therefore most of the computation is quite redundant [...]
+
+&lt;p&gt;&lt;b&gt;It is obvious that cuBLAS’ batch matmul implementation is far from efficient. Thus we use TVM to generate efficient batch matmul kernels for our NMT workloads.&lt;/b&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;batch-matmul-computation&quot;&gt;Batch matmul computation&lt;/h2&gt;
+
+&lt;p&gt;In TVM, a general batch matmul computation can be declared as:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+A = tvm.placeholder((batch, M, K), name='A')
+B = tvm.placeholder((batch, K, N), name='B')
+k = tvm.reduce_axis((0, K), 'k')
+C = tvm.compute((batch, M, N),
+         lambda b, y, x: tvm.sum(A[b, y, k] * B[b, k, x], axis = k),
+         name = 'C')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;schedule-optimization&quot;&gt;Schedule optimization&lt;/h2&gt;
+
+&lt;p&gt;After declaring the computation, we need to devise our own schedule carefully to squeeze out its performance potential.&lt;/p&gt;
+
+&lt;h3 id=&quot;tuning-parameters-of-blockthread-numbers&quot;&gt;Tuning parameters of block/thread numbers&lt;/h3&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;  # thread indices
+  block_y = tvm.thread_axis(&quot;blockIdx.y&quot;)
+  block_x = tvm.thread_axis(&quot;blockIdx.x&quot;)
+  thread_y = tvm.thread_axis((0, num_thread_y), &quot;threadIdx.y&quot;)
+  thread_x = tvm.thread_axis((0, num_thread_x), &quot;threadIdx.x&quot;)
+  thread_yz = tvm.thread_axis((0, vthread_y), &quot;vthread&quot;, name=&quot;vy&quot;)
+  thread_xz = tvm.thread_axis((0, vthread_x), &quot;vthread&quot;, name=&quot;vx&quot;)
+
+  # block partitioning
+  BB, FF, MM, PP = s[C].op.axis
+  BBFF = s[C].fuse(BB, FF)
+  MMPP = s[C].fuse(MM, PP)
+  by, ty_block = s[C].split(BBFF, factor = num_thread_y * vthread_y)
+  bx, tx_block = s[C].split(MMPP, factor = num_thread_x * vthread_x)
+  s[C].bind(by, block_y)
+  s[C].bind(bx, block_x)
+  vty, ty = s[C].split(ty_block, nparts = vthread_y)
+  vtx, tx = s[C].split(tx_block, nparts = vthread_x)
+  s[C].reorder(by, bx, vty, vtx, ty, tx)
+  s[C].reorder(by, bx, ty, tx)
+  s[C].bind(ty, thread_y)
+  s[C].bind(tx, thread_x)
+  s[C].bind(vty, thread_yz)
+  s[C].bind(vtx, thread_xz)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;We fuse the outer dimensions of the batch matmul, i.e. the BB and FF of the op’s dimension, normally known as “batch” dimension in batch matmul computation. Then we split the outer and the inner dimensions by a factor of (&lt;code class=&quot;highlighter-rouge&quot;&gt;number_thread * vthread&lt;/code&gt;).&lt;/p&gt;
+
+&lt;p&gt;A strided pattern is not needed in batch matmul, thus the virtual thread numbers (&lt;code class=&quot;highlighter-rouge&quot;&gt;vthread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;vthread_x&lt;/code&gt;) are both set to 1.&lt;/p&gt;
+
+&lt;h4 id=&quot;finding-the-best-combination-of-number_thread&quot;&gt;Finding the best combination of number_thread&lt;/h4&gt;
+
+&lt;p&gt;The results below are obtained on an NVIDIA M40 GPU device with CUDA 8.0.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Input Shape [batch,features,M,N,K]&lt;/th&gt;
+      &lt;th&gt;num_thread_y, num_thread_x&lt;/th&gt;
+      &lt;th&gt;num_vthread_y, num_vthread_x&lt;/th&gt;
+      &lt;th&gt;Time(us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;8,1&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;37.62&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;4,1&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;39.30&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;1,1&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;38.82&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;1,1&lt;/td&gt;
+      &lt;td&gt;256,1&lt;/td&gt;
+      &lt;td&gt;41.95&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;1,1&lt;/td&gt;
+      &lt;td&gt;94.61&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;As learned from &lt;a href=&quot;http://tvmlang.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html&quot;&gt;past experience&lt;/a&gt;, the method to find the best combination of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; is through brute-force search. After a brute-force search, the best combination for current shape can be f [...]
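+
+&lt;p&gt;A minimal sketch of such a brute-force search is shown below; &lt;code class=&quot;highlighter-rouge&quot;&gt;build_and_time&lt;/code&gt; is a hypothetical helper that builds the schedule with the given thread numbers and measures its runtime on the target GPU.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import itertools
+
+def brute_force_search(candidates=(1, 2, 4, 8, 16, 32)):
+    best_time, best_cfg = float('inf'), None
+    for num_thread_y, num_thread_x in itertools.product(candidates, candidates):
+        # build_and_time is a hypothetical helper: it rebuilds the schedule with
+        # these thread numbers, compiles the kernel and times it on the GPU
+        t = build_and_time(num_thread_y, num_thread_x)
+        if t &amp;lt; best_time:
+            best_time, best_cfg = t, (num_thread_y, num_thread_x)
+    return best_cfg, best_time
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;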
+
+&lt;h2 id=&quot;fuse-batch-matmul-with-other-operations&quot;&gt;Fuse batch matmul with other operations&lt;/h2&gt;
+
+&lt;p&gt;Normally, the existing “black-box” cuBLAS library calls play the role of the boundary for the commonly used “op fusion” optimization tactics. However, with the generated efficient batch matmul kernel, the fusion boundary can be easily broken: more than just element-wise operations can be fused, thus further performance improvement can be obtained.&lt;/p&gt;
+
+&lt;p&gt;It is observed from the computation graph that a batch matmul is always followed by a &lt;em&gt;broadcast add&lt;/em&gt; operation or a &lt;em&gt;transpose&lt;/em&gt; operation. By fusing the “add” or “transpose” operation with batch matmul, kernel launch overhead and redundant memory access time can be reduced.&lt;/p&gt;
+
+&lt;p&gt;Batch matmul and broadcast add fusion computation can be declared as follows:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+A = tvm.placeholder((batch_size, features, M, K), name='A')
+# the shape of B is (N, K) rather than (K, N) because B is transposed in this fusion pattern
+B = tvm.placeholder((batch_size, features, N, K), name='B')
+ENTER = tvm.placeholder((batch_size, 1, M, N), name = 'ENTER')
+k = tvm.reduce_axis((0, K), 'k')
+C = tvm.compute(
+           (batch_size, features, M, N),
+           lambda yb, yf, m, x: tvm.sum(A[yb, yf, m, k] * B[yb, yf, x, k], axis = k),
+           name = 'C')
+D = topi.broadcast_add(C, ENTER)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Batch matmul and transpose fusion computation can be declared as:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+A = tvm.placeholder((batch_size, features, M, K), name='A')
+B = tvm.placeholder((batch_size, features, K, N), name='B')
+k = tvm.reduce_axis((0, K), 'k')
+C = tvm.compute(
+           (batch_size, M, features, N),
+           lambda yb, m, yf, x: tvm.sum(A[yb, yf, m, k] * B[yb, yf, k, x], axis = k),
+           name = 'C')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;h3 id=&quot;fusion-kernel-performance&quot;&gt;Fusion Kernel Performance&lt;/h3&gt;
+
+&lt;p&gt;The shape of [batch=64, heads=8, M=1, N=17, K=128] is chosen to illustrate the performance of the generated code. 17 is chosen as the sequence length since it is the average input length in our production scenarios.&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;tf-r1.4 &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 513.9 us&lt;/li&gt;
+  &lt;li&gt;tf-r1.4 &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (separate): 541.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 37.62 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (fused): 38.39 us&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;The kernel fusion optimization brings a further &lt;b&gt;&lt;em&gt;1.7X&lt;/em&gt;&lt;/b&gt; speed-up.&lt;/p&gt;
+
+&lt;h2 id=&quot;integration-with-tensorflow&quot;&gt;Integration with TensorFlow&lt;/h2&gt;
+
+&lt;p&gt;The input shapes of batch matmul in our workload are finite and can be enumerated easily in advance. With those pre-defined shapes, we can generate highly optimized CUDA kernels ahead of time (fixed-shape computation brings the best optimization potential). Meanwhile, a general batch matmul kernel suitable for most of the shapes is also generated to provide a fall-back mechanism for shapes that do not have a corresponding ahead-of-time generated kernel.&lt;/p&gt;
+
+&lt;p&gt;The generated efficient kernels for specific shapes and the fall-back one are integrated into the TensorFlow framework. We develop fused ops, such as BatchMatMulTranspose or BatchMatMulAdd, to launch the specific generated kernel using TVM’s runtime API for a certain input shape, or to invoke the fall-back kernel. A graph optimization pass is conducted to automatically replace the original batch &lt;em&gt;matmul + add/transpose&lt;/em&gt; pattern with the fused ops. Meanwhile, by combin [...]
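+
+&lt;p&gt;The shape-based dispatch inside such a fused op can be sketched as follows; this is an illustrative outline rather than the actual implementation, and &lt;code class=&quot;highlighter-rouge&quot;&gt;specialized_kernels&lt;/code&gt; / &lt;code class=&quot;highlighter-rouge&quot;&gt;fallback_kernel&lt;/code&gt; stand in for the ahead-of-time generated TVM functions.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# hypothetical sketch of the shape-based kernel dispatch inside a fused op
+specialized_kernels = {}  # maps (batch, M, N, K) -&amp;gt; ahead-of-time generated TVM function
+fallback_kernel = None    # general TVM kernel covering the remaining shapes
+
+def batch_matmul_dispatch(A, B, C, shape):
+    # use the specialized kernel when the shape was known at build time,
+    # otherwise fall back to the general kernel
+    kernel = specialized_kernels.get(shape, fallback_kernel)
+    kernel(A, B, C)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;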
+
+&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
+&lt;p&gt;Inside Alibaba, we found that TVM is a very productive tool to develop high-performance GPU kernels to meet our in-house requirements. In this blog, the NMT Transformer model is taken as an example to illustrate our optimization strategy with TVM. Firstly, we locate the hot-spot of the Transformer model through first-principle analysis. Then we use TVM to generate highly optimized CUDA kernels to replace the cuBLAS version (&lt;b&gt;&lt;em&gt;13X&lt;/em&gt;&lt;/b&gt; speed-up is observed). N [...]
+
+&lt;h2 id=&quot;resources&quot;&gt;Resources&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/Orion34C/tvm-batch-matmul-example/blob/master/tvm_batch_matmul_transpose_m1_kX.py&quot;&gt;TVM implementation of fused batch matmul + transpose computation&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+&lt;p&gt;[1] &lt;a href=&quot;https://arxiv.org/pdf/1706.03762.pdf&quot;&gt;Attention is All You Need&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[2] &lt;a href=&quot;https://devblogs.nvidia.com/cuda-pro-tip-nvprof-your-handy-universal-gpu-profiler/&quot;&gt;nvprof is Your Handy Universal GPU Profiler&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[3] &lt;a href=&quot;https://github.com/tensorflow/tensorflow/pull/16306&quot;&gt;Add Loop Invariant Node Motion Optimization in GraphOptimizer&lt;/a&gt;&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Compiling Deep Learning Models to WebGL with TVM</title>
+   <link href="https://tvm.apache.org/2018/03/12/webgl"/>
+   <updated>2018-03-12T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2018/03/12/webgl</id>
+   <content type="html">&lt;p&gt;Now TVM comes with a brand-new OpenGL/WebGL backend!
+This blog post explains what it is, and what you can achieve with it.&lt;/p&gt;
+
+&lt;h1 id=&quot;the-openglwebgl-backend&quot;&gt;The OpenGL/WebGL Backend&lt;/h1&gt;
+
+&lt;p&gt;TVM already targets a lot of backends covering a variety of platforms: CPU, GPU,
+mobile devices, etc… This time we are adding another backend: OpenGL/WebGL.&lt;/p&gt;
+
+&lt;p&gt;OpenGL/WebGL enables us to leverage the GPU in an environment which does not
+have CUDA installed. It is, for the time being, the only way of using the GPU
+inside a browser.&lt;/p&gt;
+
+&lt;p&gt;This new backend allows us to use OpenGL/WebGL in 3 different ways:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;Local OpenGL&lt;/strong&gt;:
+We can compile a deep learning model into OpenGL and directly
+run it on the local machine, entirely using Python.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;WebGL with RPC&lt;/strong&gt;:
+We can compile a deep learning model into WebGL and export
+it as a shared library via Emscripten, with JavaScript host code and WebGL device code. Then
+we can deploy that library through RPC onto a TVM JavaScript runtime system
+running inside a browser.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;WebGL with static library&lt;/strong&gt;:
+We can compile a deep learning model into WebGL,
+link it with the TVM JavaScript runtime system and export the entire package.
+Then we can run the model in a web page on a browser, with no dependency.
+The detailed flow is described in figure 1.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;We rely on Emscripten and its fastcomp LLVM backend to generate the JavaScript backend.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opengl/webgl-flow.png&quot; alt=&quot;image&quot; width=&quot;65%&quot; /&gt;&lt;br /&gt;
+Figure 1&lt;/p&gt;
+
+&lt;p&gt;See &lt;a href=&quot;https://github.com/dmlc/nnvm/blob/master/tutorials/from_mxnet_to_webgl.py&quot;&gt;here&lt;/a&gt;
+for examples of all three of them.&lt;/p&gt;
+
+&lt;h1 id=&quot;how-is-this-different-from-x&quot;&gt;How is this Different from X?&lt;/h1&gt;
+
+&lt;p&gt;Running a neural network on a browser isn’t an entirely new thing.
+Andrej Karpathy’s &lt;a href=&quot;https://cs.stanford.edu/people/karpathy/convnetjs/&quot;&gt;ConvNetJS&lt;/a&gt;
+and Google’s &lt;a href=&quot;https://deeplearnjs.org/&quot;&gt;DeepLearning.JS&lt;/a&gt; are examples of that.&lt;/p&gt;
+
+&lt;p&gt;So what’s unique about TVM with WebGL? The big difference is that the op kernels
+in TVM are automatically compiled, not handwritten. As shown in Figure 2, TVM
+utilizes a unified AST to define kernels, and compiles it to code on different
+platforms.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opengl/comparison.png&quot; alt=&quot;&quot; width=&quot;50%&quot; /&gt;&lt;br /&gt;
+Figure 2&lt;/p&gt;
+
+&lt;p&gt;This means that:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;To deploy your existing model to WebGL, you don’t need to write a lot of
+additional code. The NNVM/TVM model definition is the same for all targets, so
+you just need to compile it to a new target.&lt;/li&gt;
+  &lt;li&gt;To add a new op kernel, you only need to define it in TVM once, instead of
+implementing it once for every target. You don’t need to know how to write
+GLSL code to add a new op kernel to WebGL!&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;benchmark&quot;&gt;Benchmark&lt;/h1&gt;
+
+&lt;p&gt;Here we perform a benchmark for a typical workload: image classification using
+resnet18.&lt;/p&gt;
+
+&lt;p&gt;I’m using my &lt;a href=&quot;https://www.asus.com/us/Laptops/N76VZ/&quot;&gt;5-year-old laptop&lt;/a&gt; which
+has an 8-core Intel® Core™ i7-3610QM, and a GTX650M.&lt;/p&gt;
+
+&lt;p&gt;In this benchmark, we download a resnet18 model from the Gluon model zoo, and
+perform end-to-end classification on a cat image. We only measure the model
+execution time (without model/input/parameter loading), and each model is run
+100 times to get an average. The results are shown in figure 3.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opengl/opengl-benchmark.png&quot; alt=&quot;image&quot; /&gt;&lt;br /&gt;
+Figure 3&lt;/p&gt;
+
+&lt;p&gt;The benchmark is run in 4 different settings:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;CPU (LLVM)&lt;/strong&gt;: The model is compiled into LLVM IR and JIT’ed. Therefore, it is
+run entirely on the CPU.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;OpenCL&lt;/strong&gt;: The model is compiled into OpenCL. There is still some glue code
+compiled to LLVM, which is responsible for setting up and launching OpenCL
+kernels. Then we run it on the local machine.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;OpenGL&lt;/strong&gt;: Same as OpenCL, but compiled to OpenGL.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;WebGL&lt;/strong&gt;: The glue code is compiled to LLVM, and transformed to JavaScript using
+Emscripten’s Fastcomp LLVM backend.
+The device code is compiled to WebGL. We execute the model in Firefox.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;From the result above we can observe that the TVM OpenGL backend has a similar
+performance as OpenCL. More interestingly, the WebGL version inside the browser
+isn’t significantly slower than desktop OpenGL. Considering that the host code
+is JavaScript, this is quite surprising. This might be due to the fact that
+Emscripten generates &lt;a href=&quot;http://asmjs.org/&quot;&gt;asm.js&lt;/a&gt; which enables dramatic
+optimizations in Firefox.&lt;/p&gt;
+
+&lt;p&gt;This is a first step toward automatic compilation of deep learning models
+into the web browser. We expect more performance improvements as we bring
+optimizations into the TVM stack.&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the Code&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;Checkout &lt;a href=&quot;https://github.com/dmlc/nnvm/blob/master/tutorials/from_mxnet_to_webgl.py&quot;&gt;this complete code example&lt;/a&gt;.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;acknowledgement&quot;&gt;Acknowledgement&lt;/h2&gt;
+&lt;p&gt;We thank the developers of Emscripten for providing the fastcomp toolchain and for their help during development.&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Optimizing Mobile Deep Learning on ARM GPU with TVM</title>
+   <link href="https://tvm.apache.org/2018/01/16/opt-mali-gpu"/>
+   <updated>2018-01-16T00:00:00-08:00</updated>
+   <id>https://tvm.apache.org/2018/01/16/opt-mali-gpu</id>
+   <content type="html">&lt;p&gt;With the great success of deep learning, the demand for
+deploying deep neural networks to mobile devices is growing rapidly.
+Similar to what we do on desktop platforms, utilizing the GPU in mobile devices
+can benefit both inference speed and energy efficiency. However, most
+existing deep learning frameworks do not support mobile GPUs very well.
+The difficulty lies in the difference between mobile GPU architectures and
+desktop GPU architectures, which means special effort is required for optimizing on
+mobile GPUs. This non-trivial extra work eventually results in the poor support
+of mobile GPUs in most deep learning frameworks.&lt;/p&gt;
+
+&lt;p&gt;TVM addresses the difficulty of deploying to different hardware by
+introducing a unified IR stack, with which the optimization for different
+hardware targets can be done easily.  In this post, we show how we use
+&lt;a href=&quot;http://tvmlang.org/2017/08/17/tvm-release-announcement.html&quot;&gt;TVM&lt;/a&gt;/&lt;a href=&quot;http://tvmlang.org/2017/10/06/nnvm-compiler-announcement.html&quot;&gt;NNVM&lt;/a&gt; to
+generate efficient kernels for ARM Mali GPU and do end-to-end compilation.
+In our test on Mali-T860 MP4, compared with
+&lt;a href=&quot;https://developer.arm.com/technologies/compute-library&quot;&gt;Arm Compute Library&lt;/a&gt;,
+our method is 1.4x faster on VGG-16 and 2.2x faster on MobileNet.
+Both graph-level and operator-level optimization contribute
+to this speed up.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opt-mali/end2end.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;center&gt; Figure. Inference Speed of Different Backends on ImageNet&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h1 id=&quot;mali-midgrad-gpu&quot;&gt;Mali Midgard GPU&lt;/h1&gt;
+&lt;p&gt;We will use Firefly-RK3399 with Mali-T860 MP4 as our test environment,
+so we mainly focus on Mali T8xx below.&lt;/p&gt;
+
+&lt;h2 id=&quot;architecture&quot;&gt;Architecture&lt;/h2&gt;
+&lt;p&gt;Figure 1 is an overview of the Mali Architecture on T860 and T880.
+The GPUs are scalable up to 16 coherent shader cores. Inside each
+shader core, there are 2 or 3 arithmetic pipelines, 1 load/store pipeline
+and 1 texture pipeline (so-called TriPipe). The ALU in each arithmetic
+pipeline has four 128-bit vector units and one scalar unit.&lt;/p&gt;
+
+&lt;p&gt;We use OpenCL for GPU computing. When mapping to the OpenCL model, each
+shader core executes one or several work groups. Each shader core supports
+up to 384 concurrently executing threads. Each work item in OpenCL
+typically maps to a single thread on a Mali GPU.
+The Mali GPUs use a VLIW (Very Long Instruction Word) architecture.
+Each instruction word contains multiple operations. The Mali GPUs
+also use SIMD, so that most arithmetic instructions operate on
+multiple data elements simultaneously. &lt;sup&gt;[1]&lt;/sup&gt;&lt;/p&gt;
+
+&lt;center&gt; &lt;img width=&quot;50%&quot; src=&quot;/images/opt-mali/mali-arch.png&quot; /&gt; &lt;/center&gt;
+&lt;center&gt; Figure 1. Mali T860 and T880 (source &lt;sup&gt;[2]&lt;/sup&gt;) &lt;/center&gt;
+
+&lt;h2 id=&quot;difference-with-nvidias-gpus&quot;&gt;Difference with NVIDIA’s GPUs&lt;/h2&gt;
+&lt;p&gt;Here are some differences that we should be aware of when writing OpenCL
+code for Mali GPUs, compared with writing for NVIDIA’s GPUs.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;Mali GPUs use a unified global memory. In NVIDIA’s GPUs, we usually
+copy data to shared memory, because NVIDIA’s GPUs have physically
+separate global memory, shared memory and register. In Mali, this copy
+does not improve performance and can be removed. Besides, Mali GPUs
+usually share the global memory with CPU, so there is no need for copying
+between CPU and GPU.&lt;/li&gt;
+  &lt;li&gt;Mali Midgard GPUs are based on SIMD (Single Instruction Multiple Data)
+and need explicit vectorization (see the sketch after this list). In NVIDIA CUDA, parallelism is
+achieved by SIMT (Single Instruction Multiple Thread), which does
+not require explicit vectorization. But also notice that the newer
+Mali Bifrost GPUs are based on quad-style vectorization and do not
+require explicit vectorization.&lt;/li&gt;
+  &lt;li&gt;All threads in Mali GPUs have individual program counters. It means
+the &lt;code class=&quot;highlighter-rouge&quot;&gt;warp size&lt;/code&gt; is 1, so that branch divergence is not a major problem.&lt;/li&gt;
+&lt;/ul&gt;
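+
+&lt;p&gt;To give a flavor of what explicit vectorization looks like in a TVM schedule, the sketch below splits the innermost axis by the vector width and marks the inner part as vectorized; it assumes &lt;code class=&quot;highlighter-rouge&quot;&gt;s&lt;/code&gt; is a schedule over a 2-D output tensor &lt;code class=&quot;highlighter-rouge&quot;&gt;C&lt;/code&gt;, and the factor of 4 is an illustrative choice matching a 4-wide float vector.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# illustrative only: explicit vectorization of the innermost axis
+xo, xi = s[C].split(C.op.axis[1], factor=4)
+s[C].vectorize(xi)  # the 4-wide inner part maps to a 128-bit vector operation
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;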
+
+&lt;h1 id=&quot;optimization--convolution-as-example&quot;&gt;Optimization : Convolution as Example&lt;/h1&gt;
+&lt;p&gt;The convolution layer is the core of most deep neural networks and
+takes most of the computation time. So we take the convolution layer
+as an example to demonstrate how common optimization techniques like
+packing, tiling, unrolling and vectorization are applied in TVM.&lt;/p&gt;
+
+&lt;h2 id=&quot;im2col-with-gemm&quot;&gt;Im2Col with GEMM&lt;/h2&gt;
+&lt;p&gt;A well-known algorithm for the convolution layer is &lt;a href=&quot;https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/&quot;&gt;im2col&lt;/a&gt;,
+which converts the little 3D input cubes to columns of a matrix and
+performs a GEMM. The advantage of this method is easy utilization of a
+highly optimized BLAS library.  However, the memory redundancy
+(9x memory for a 3x3 kernel) is awful.&lt;/p&gt;
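+
+&lt;p&gt;A minimal NumPy sketch of im2col for a single-channel input (stride 1, no padding) makes the memory blow-up concrete: each 3x3 window is copied into its own column, so every input pixel is duplicated up to 9 times. This is for illustration only and is not the TVM implementation.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+def im2col(x, kh=3, kw=3):
+    # x: (H, W) single-channel input, stride 1, no padding -- illustration only
+    H, W = x.shape
+    oh, ow = H - kh + 1, W - kw + 1
+    cols = np.empty((kh * kw, oh * ow), dtype=x.dtype)
+    for i in range(oh):
+        for j in range(ow):
+            cols[:, i * ow + j] = x[i:i + kh, j:j + kw].ravel()
+    return cols  # convolution becomes a GEMM: weight.reshape(1, kh * kw).dot(cols)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;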
+
+&lt;h2 id=&quot;spatial-packing&quot;&gt;Spatial Packing&lt;/h2&gt;
+&lt;p&gt;Instead, we adopt a method to calculate the convolution, and apply the
+optimization techniques step by step. A convolution layer in VGG-16
+is used as the tuning case, whose configuration is listed below.
+We assume the batch size is 1 for inference.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Input Shape&lt;/th&gt;
+      &lt;th&gt;Output Shape&lt;/th&gt;
+      &lt;th&gt;Kernel Size&lt;/th&gt;
+      &lt;th&gt;Stride&lt;/th&gt;
+      &lt;th&gt;Padding&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;56x56x256&lt;/td&gt;
+      &lt;td&gt;56x56x256&lt;/td&gt;
+      &lt;td&gt;3x3&lt;/td&gt;
+      &lt;td&gt;(1, 1)&lt;/td&gt;
+      &lt;td&gt;(1, 1)&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;As a baseline, we also list the performance of this layer in the
+Arm Compute Library.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
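+
+&lt;p&gt;As a quick sanity check (a minimal sketch, counting one multiply and one add
+per multiply-accumulate), the GFLOPS number can be reproduced from the layer
+configuration above.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# reproduce the GFLOPS number from the layer configuration above
+N, H, W, CO, CI, KH, KW = 1, 56, 56, 256, 256, 3, 3
+
+flops = 2 * N * H * W * CO * CI * KH * KW   # one multiply and one add per MAC
+cost  = 0.1821                              # seconds, from the table above
+
+print(flops / cost / 1e9)                   # roughly 20.3 GFLOPS
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;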
+
+&lt;h3 id=&quot;declare-the-computation-tiling-and-packing&quot;&gt;Declare the computation: tiling and packing&lt;/h3&gt;
+&lt;p&gt;Tiling and packing are two methods intended for better memory access.
+Tiling separates the whole computation into small blocks for better
+data reuse.  Packing re-lays out the input matrices according to the
+tiling so that we can access the memory sequentially, which reduces
+the cache miss rate.&lt;/p&gt;
+
+&lt;p&gt;We do tiling on the width dimension of the input image and the CO dimension
+of the filter matrix.  This is described by &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.compute&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tiling factor
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;VW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# get input shape
+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;TH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;H_PAD&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;TW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;W_PAD&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# calc output shape
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;H_PAD&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;n&quot;&gt;OW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;W_PAD&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt;&lt;span class=&qu [...]
+
+&lt;span class=&quot;c1&quot;&gt;# data shape after packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dvshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;TH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;s [...]
+
+&lt;span class=&quot;c1&quot;&gt;# kernel shape after packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kvshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&l [...]
+
+&lt;span class=&quot;n&quot;&gt;ovshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt; &lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OW&lt;/span&gt;&lt;span class=&quo [...]
+
+&lt;span class=&quot;c1&quot;&gt;# define packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&l [...]
+    &lt;span class=&quot;n&quot;&gt;data_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&q [...]
+
+&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt [...]
+    &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&q [...]
+
+&lt;span class=&quot;c1&quot;&gt;# define convolution
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt [...]
+&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ovshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;sp [...]
+    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+            &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
+            &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;sp [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unpack to correct layout
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/ [...]
+                     &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;//&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt [...]
+                     &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'output_unpack'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tag&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'direct_conv_output'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We can inspect the defined IR by&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;I pick the convolution part here.&lt;/p&gt;
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;produce conv {
+  for (co, 0, 64) {
+    for (h, 0, 56) {
+      for (w, 0, 14) {
+        for (vw.init, 0, 4) {
+          for (vc.init, 0, 4) {
+            conv[((((((((co*56) + h)*14) + w)*4) + vw.init)*4) + vc.init)] = 0.000000f
+          }
+        }
+        for (ci, 0, 256) {
+          for (kh, 0, 3) {
+            for (kw, 0, 3) {
+              for (vw, 0, 4) {
+                for (vc, 0, 4) {
+                  conv[((((((((co*56) + h)*14) + w)*4) + vw)*4) + vc)] = (conv[((((((((co*56) + h)*14) + w)*4) + vw)*4) + vc)] + (data_vec[(((((((((h*14) + w)*256) + ci)*3) + kh)*6) + kw) + vw)]*kernel_vec[((((((((co*256) + ci)*3) + kh)*3) + kw)*4) + vc)]))
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h3 id=&quot;kernel-1-bind-thread&quot;&gt;Kernel 1: bind thread&lt;/h3&gt;
+&lt;p&gt;In TVM, we declare the computation first and then &lt;em&gt;schedule&lt;/em&gt; it.
+This mechanism decouples the algorithm from the implementation details. (This idea
+comes from &lt;a href=&quot;http://halide-lang.org/&quot;&gt;Halide&lt;/a&gt;.)&lt;/p&gt;
+
+&lt;p&gt;The following schedule simply binds axes to GPU threads, so that
+our code can run on the Mali GPU.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# helper function for binding thread
+&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/s [...]
+    &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot; tile and bind 3d &quot;&quot;&quot;&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;or&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z_factor&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x_factor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x_factor&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;or&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+
+&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule data packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+
+&lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
+
+&lt;span class=&quot;c1&quot;&gt;# schedule conv
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
+
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;With this schedule, our code can now run, but the performance is terrible.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+      &lt;th&gt;speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+      &lt;td&gt;1x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 1: simple bind&lt;/td&gt;
+      &lt;td&gt;5.6154&lt;/td&gt;
+      &lt;td&gt;0.6588&lt;/td&gt;
+      &lt;td&gt;0.03x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h3 id=&quot;kernel-2-unrolling&quot;&gt;Kernel 2: unrolling&lt;/h3&gt;
+&lt;p&gt;Loop unrolling can reduce the instructions needed for loop control, reduce
+branch penalties and hide memory read latency.
+In TVM, this can be done easily by calling &lt;code class=&quot;highlighter-rouge&quot;&gt;s.unroll(axis)&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule data packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
+
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule conv
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
+
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+      &lt;th&gt;speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+      &lt;td&gt;1x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 1: simple bind&lt;/td&gt;
+      &lt;td&gt;5.6154&lt;/td&gt;
+      &lt;td&gt;0.6588&lt;/td&gt;
+      &lt;td&gt;0.03x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 2: + unrolling&lt;/td&gt;
+      &lt;td&gt;0.3707&lt;/td&gt;
+      &lt;td&gt;9.9796&lt;/td&gt;
+      &lt;td&gt;0.49x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h3 id=&quot;kernel3-vectorization&quot;&gt;Kernel 3: vectorization&lt;/h3&gt;
+&lt;p&gt;As mentioned before, we need to do vectorization explicitly
+in order to achieve the best performance on the Mali GPU.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule data packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unroll
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unroll
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! VECTORIZE HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule conv
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
+
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unroll
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! VECTORIZE HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+      &lt;th&gt;speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+      &lt;td&gt;1x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 1: simple bind&lt;/td&gt;
+      &lt;td&gt;5.6154&lt;/td&gt;
+      &lt;td&gt;0.6588&lt;/td&gt;
+      &lt;td&gt;0.03x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 2: + unrolling&lt;/td&gt;
+      &lt;td&gt;0.3707&lt;/td&gt;
+      &lt;td&gt;9.9796&lt;/td&gt;
+      &lt;td&gt;0.49x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 3: + vectorization&lt;/td&gt;
+      &lt;td&gt;0.1304&lt;/td&gt;
+      &lt;td&gt;28.3679&lt;/td&gt;
+      &lt;td&gt;1.40x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
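+
+&lt;p&gt;The speedup column is simply the cost of the Arm Compute Library GEMM kernel
+divided by the cost of each kernel (a minimal sketch of the arithmetic).&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# speedup = baseline cost / kernel cost, with the ACL GEMM kernel as 1x
+baseline = 0.1821  # seconds
+
+for name, cost in [('Kernel 1', 5.6154), ('Kernel 2', 0.3707), ('Kernel 3', 0.1304)]:
+    print(name, round(baseline / cost, 2))  # 0.03, 0.49, 1.4
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;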
+
+&lt;h3 id=&quot;how-to-set-the-tunable-parameter&quot;&gt;How to set the tunable parameters&lt;/h3&gt;
+&lt;p&gt;Some of the tunable parameters above can be calculated directly.
+For the vectorized dimension &lt;code class=&quot;highlighter-rouge&quot;&gt;VC&lt;/code&gt;, we should fill the 128-bit register,
+so it can be set to 128/32=4 for float32 and 128/16=8 for float16.&lt;/p&gt;
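+
+&lt;p&gt;This rule can be written down directly (a minimal sketch, assuming the
+128-bit vector registers described above).&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# derive the vectorization factor VC from the element width,
+# assuming 128-bit vector registers
+def vector_width(dtype_bits, register_bits=128):
+    return register_bits // dtype_bits
+
+print(vector_width(32))  # float32: VC = 4
+print(vector_width(16))  # float16: VC = 8
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;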
+
+&lt;p&gt;But more often we cannot determine the optimal value, due to the
+complicated runtime behavior. We use grid search in TVM. It can be
+done very effectively since we write Python code against TVM’s high-level
+IR rather than writing OpenCL code directly.&lt;/p&gt;
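+
+&lt;p&gt;A minimal skeleton of such a grid search is sketched below. The helper
+&lt;code class=&quot;highlighter-rouge&quot;&gt;build_and_measure&lt;/code&gt; is a hypothetical placeholder (not a TVM API); in the real
+tuning script it would apply the schedule above with the given factors and time
+the kernel on the device.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import itertools
+
+# grid search skeleton; build_and_measure is a dummy stand-in that would
+# build the schedule with the given factors and time it on the device
+def build_and_measure(VH, VW, VC, num_thread):
+    return abs(VC - 4) + abs(num_thread - 8)  # dummy cost, for illustration only
+
+candidates = itertools.product([1, 2], [2, 4, 8], [2, 4, 8], [4, 8, 16])
+best = min(candidates, key=lambda cfg: build_and_measure(*cfg))
+print(best)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;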
+
+&lt;h3 id=&quot;the-generated-opencl-code&quot;&gt;The generated OpenCL code&lt;/h3&gt;
+&lt;p&gt;We can view the generated OpenCL code by&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;The OpenCL code is too long to be pasted here, and it is hard to read due
+to heavy unrolling. If interested, you can view it
+&lt;a href=&quot;https://github.com/merrymercy/tvm-mali/blob/master/data/kernels.cl&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h1 id=&quot;end-to-end-benchmarking&quot;&gt;End-to-End Benchmarking&lt;/h1&gt;
+&lt;p&gt;In this section, we compare the end-to-end performance of
+different backends on some popular deep neural networks.
+Our test environment is:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Firefly-RK3399 4G
+CPU: dual-core Cortex-A72 + quad-core Cortex-A53
+GPU: Mali-T860MP4
+
+Arm Compute Library: v17.12
+MXNet: v1.0.1
+OpenBLAS: v0.2.18
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We use NNVM and TVM to do end-to-end compilation.&lt;/p&gt;
+
+&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opt-mali/end2end.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;center&gt; Figure 2. Inference Speed of Different Backends on ImageNet&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;As shown in Figure 2, we test the inference speed on ImageNet.
+On the Firefly-RK3399, the Mali GPU can be 2x ~ 4x faster than the 6-core big.LITTLE CPU.
+Our end-to-end pipeline is 1.4x ~ 2.2x faster than the Arm Compute Library.
+We tried both the GEMM and the direct method for the convolution layers in the
+Arm Compute Library; the GEMM method is always faster than the direct method
+in these test cases, so we only plot the results of the GEMM method.&lt;/p&gt;
+
+&lt;p&gt;Some results, like resnet18 on the Arm Compute Library, are missing from Figure 2.
+This is because the graph runtime of the Arm Compute Library currently does not
+support skip connections and has a poor NEON implementation of
+depthwise convolution.  This also reflects the advantage of the NNVM
+software stack.&lt;/p&gt;
+
+&lt;h2 id=&quot;half-precision-performance&quot;&gt;Half-Precision Performance&lt;/h2&gt;
+&lt;p&gt;High numerical precision is not always necessary in deep neural networks, especially
+for inference on mobile devices. Using low-precision arithmetic
+can make inference much faster. We also test half-precision
+floating point on the Mali GPU.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;model&lt;/th&gt;
+      &lt;th&gt;backend&lt;/th&gt;
+      &lt;th&gt;Time Cost per Image (second)&lt;/th&gt;
+      &lt;th&gt;Speedup over FP32&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;vgg16&lt;/td&gt;
+      &lt;td&gt;ACM-mali&lt;/td&gt;
+      &lt;td&gt;0.9694&lt;/td&gt;
+      &lt;td&gt;1.69x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;vgg16&lt;/td&gt;
+      &lt;td&gt;TVM-mali&lt;/td&gt;
+      &lt;td&gt;0.6896&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;1.87x&lt;/strong&gt;&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;MobileNet 1.0&lt;/td&gt;
+      &lt;td&gt;TVM-mali&lt;/td&gt;
+      &lt;td&gt;0.0479&lt;/td&gt;
+      &lt;td&gt;1.60x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;ResNet18&lt;/td&gt;
+      &lt;td&gt;TVM-mali&lt;/td&gt;
+      &lt;td&gt;0.1183&lt;/td&gt;
+      &lt;td&gt;1.73x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;center&gt; Table 1. Inference Speed of FP16 on ImageNet&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;In theory, FP16 can both double the peak compute and halve the memory consumption,
+thereby doubling the speed. But it needs good input shapes for
+longer vectorization and fine-tuning of some parameters.&lt;/p&gt;
+
+&lt;h2 id=&quot;further-work-on-mobile-devices&quot;&gt;Further Work on Mobile Devices&lt;/h2&gt;
+&lt;p&gt;We should admit that there is still some room for improvement,
+mainly at the graph level, such as model compression and weight pre-layout.
+Further improvements in NNVM will try to solve these problems.&lt;/p&gt;
+
+&lt;h1 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h1&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/merrymercy/tvm-mali&quot;&gt;End-to-End benchmark&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/topi/python/topi/mali&quot;&gt;Convolution and Depthwise Convolution Schedule&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;bio--acknowledgement&quot;&gt;Bio &amp;amp; Acknowledgement&lt;/h1&gt;
+&lt;p&gt;&lt;a href=&quot;https://lmzheng.net&quot;&gt;Lianmin Zheng&lt;/a&gt; is an undergraduate
+student at SJTU Apex lab.  He is interested in machine learning
+and building computer systems.&lt;/p&gt;
+
+&lt;p&gt;The author would like to thank
+&lt;a href=&quot;https://homes.cs.washington.edu/~tqchen/&quot;&gt;Tianqi Chen&lt;/a&gt; for his helpful
+advice and &lt;a href=&quot;https://github.com/yzhliu&quot;&gt;Yizhi Liu&lt;/a&gt; for his earlier work.&lt;/p&gt;
+
+&lt;h1 id=&quot;reference&quot;&gt;Reference&lt;/h1&gt;
+&lt;p&gt;[1] &lt;a href=&quot;https://developer.arm.com/docs/100614/0302&quot;&gt;ARM Mali GPU OpenCL Developer Guide&lt;/a&gt;&lt;br /&gt;
+[2] &lt;a href=&quot;https://developer.arm.com/&quot;&gt;ARM Developer&lt;/a&gt;&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Remote Profile and Test Deep Learning Cross Compilation on Mobile Phones with TVM RPC</title>
+   <link href="https://tvm.apache.org/2017/11/08/android-rpc-introduction"/>
+   <updated>2017-11-08T00:00:00-08:00</updated>
+   <id>https://tvm.apache.org/2017/11/08/android-rpc-introduction</id>
+   <content type="html">&lt;p&gt;The TVM stack is an end-to-end compilation stack to deploy deep learning workloads to all hardware backends.
+Thanks to the NNVM compiler support of the TVM stack, we can now take model descriptions directly from deep learning frameworks and compile them to bare metal code.
+An impressive feature of TVM is its ability to deploy computation workloads on different platforms, such as GPUs and mobile phones (more hardware backends will be supported).&lt;/p&gt;
+
+&lt;p&gt;However, when we want to test and profile cross compilation, it is hard to try out different computation workloads on a heterogeneous device such as a Raspberry Pi or a mobile phone.
+In order to optimize a computation task, one has to edit the code on the development PC, compile, deploy to the device, test, and then modify the code again to see whether it gets faster. The workflow looks like this,&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/flow1.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Is there any way to speed up this process?&lt;/p&gt;
+
+&lt;p&gt;Today we introduce an approach to deploy and test TVM workloads on Android phones. We develop a TVM runtime for Java and build an Android APP upon it. The Android APP takes a shared library as input and runs compiled functions on the mobile phone. Thus our workflow simplifies to this,&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/flow2.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;With the help of the TVM RPC, one can build TVM functions and NDArrays on a remote device. The ability to cross-compile to different platforms makes it easy to develop on one platform and test on another.&lt;/p&gt;
+
+&lt;p&gt;The process is illustrated as follows:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/arch.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;run-tvm-app-on-android-phone&quot;&gt;Run TVM APP on Android Phone&lt;/h2&gt;
+
+&lt;p&gt;You can find the Android RPC APP in &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/android_rpc&quot;&gt;apps/android_rpc&lt;/a&gt;. Please follow the instructions to build for your Android device. Once the APK is built, sign it using &lt;code class=&quot;highlighter-rouge&quot;&gt;apps/android_rpc/dev_tools&lt;/code&gt; and install it on the phone. The APP looks like:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/app.png&quot; alt=&quot;image&quot; width=&quot;25%&quot; /&gt;
+&lt;img src=&quot;/images/android_rpc/app_error.png&quot; alt=&quot;image&quot; width=&quot;25%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Usually we cannot start a standalone server on the mobile phone; instead, we start a proxy server and use our app to connect to it.&lt;/p&gt;
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;python &lt;span class=&quot;nt&quot;&gt;-m&lt;/span&gt; tvm.exec.rpc_proxy
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;create-ndarray-on-the-phone&quot;&gt;Create NDArray on the Phone&lt;/h2&gt;
+
+&lt;p&gt;Now we can connect to the proxy server from the laptop:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;9090&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;This will give us a handle &lt;code class=&quot;highlighter-rouge&quot;&gt;remote&lt;/code&gt; which we can use to communicate with the mobile phone. For instance, the following lines create a 1024x1024 matrix on the phone’s GPU:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span [...]
+	&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;&lt;span cla [...]
+	&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;When &lt;code class=&quot;highlighter-rouge&quot;&gt;A.asnumpy()&lt;/code&gt; is called from the laptop, the matrix &lt;code class=&quot;highlighter-rouge&quot;&gt;A&lt;/code&gt; will be copied to the phone’s RAM and then transferred to the laptop through the proxy server. The TVM RPC interface is transparent to users.&lt;/p&gt;
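+
+&lt;p&gt;For example, a minimal sketch (continuing the snippet above) of pulling the remote array back into host memory:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;A_host = A.asnumpy()   # copied from the phone, through the proxy server, back to the laptop
+print(A_host.shape)    # (1024, 1024)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;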
+
+&lt;h2 id=&quot;gemm-matrix-multiplication-on-the-phone&quot;&gt;GEMM (Matrix Multiplication) on the Phone&lt;/h2&gt;
+
+&lt;p&gt;Now we are going to introduce how to test matrix multiplication on an Android phone. First let’s define a very simple GEMM schedule:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;gemm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
+    &lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
+    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
+
+    &lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+        &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ii&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;jj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span  [...]
+        &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'C'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;threadIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;bo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;to&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=&q [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class=& [...]
+
+    &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&q [...]
+
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    	&lt;span class=&quot;s&quot;&gt;&quot;opencl&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
+    	&lt;span class=&quot;n&quot;&gt;target_host&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm -target=arm64-linux-android&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
+    	&lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;There’s nothing special except the last line. Here we set the target to ‘opencl’, since OpenCL is the compute language our Mali GPU supports. Note that we set &lt;code class=&quot;highlighter-rouge&quot;&gt;target_host&lt;/code&gt; to ‘&lt;code class=&quot;highlighter-rouge&quot;&gt;llvm -target=arm64-linux-android&lt;/code&gt;’, which depends on the architecture of your Android phone. We tested on a Samsung Galaxy S6 Edge, which has a Mali-T760 GPU. Here is the CPU info for  [...]
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nv&quot;&gt;$ &lt;/span&gt;adb shell
+shell@zenltechn:/ &lt;span class=&quot;nv&quot;&gt;$ &lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;cat&lt;/span&gt; /proc/cpuinfo
+Processor	: AArch64 Processor rev 2 &lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;aarch64&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt;
+processor	: 0
+processor	: 1
+processor	: 2
+processor	: 3
+processor	: 4
+processor	: 5
+processor	: 6
+processor	: 7
+Features	: fp asimd aes pmull sha1 sha2 crc32
+CPU implementer	: 0x41
+CPU architecture: AArch64
+CPU variant	: 0x0
+CPU part	: 0xd03
+CPU revision	: 2
+
+Hardware	: SAMSUNG Exynos7420
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Please refer to &lt;a href=&quot;https://clang.llvm.org/docs/CrossCompilation.html#target-triple&quot;&gt;target triple&lt;/a&gt; to learn the compile options for LLVM.&lt;/p&gt;
+
+&lt;p&gt;We use &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.contrib.ndk&lt;/code&gt; to build the shared library for the Android system,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;, [...]
+&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gemm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bn&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;256&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tempdir&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relpath&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;export_library&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ndk&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_shared&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;ndk.create_shared&lt;/code&gt; reads the environment variable &lt;code class=&quot;highlighter-rouge&quot;&gt;TVM_NDK_CC&lt;/code&gt; to find the compiler &amp;amp; linker for the Android device. We can easily use the NDK to generate a standalone toolchain for our device. For example, the following commands generate standalone compilers and linkers for ARM64 Android devices.&lt;/p&gt;
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nb&quot;&gt;cd&lt;/span&gt; /opt/android-ndk/build/tools/
+./make-standalone-toolchain.sh &lt;span class=&quot;nt&quot;&gt;--platform&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;android-24 &lt;span class=&quot;nt&quot;&gt;--use-llvm&lt;/span&gt; &lt;span class=&quot;nt&quot;&gt;--arch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;arm64 &lt;span class=&quot;nt&quot;&gt;--install-dir&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;/opt/android-toolchain-arm64
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
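+
+&lt;p&gt;With the toolchain in place, &lt;code class=&quot;highlighter-rouge&quot;&gt;TVM_NDK_CC&lt;/code&gt; just needs to point at its C++ compiler before &lt;code class=&quot;highlighter-rouge&quot;&gt;ndk.create_shared&lt;/code&gt; is used. A minimal sketch, assuming the install directory above (the exact compiler binary name can differ between NDK versions):&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import os
+
+# assumed path, following the make-standalone-toolchain.sh invocation above
+os.environ['TVM_NDK_CC'] = '/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++'
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;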
+
+&lt;p&gt;If everything goes right, we’ve got a shared library ‘gemm_gpu.so’. Now let’s upload it to the mobile phone, have the phone load the module, and get a remote handle,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&qu [...]
+
+&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;upload&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Create the remote arrays and print the running time,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;sp [...]
+
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
+
+&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+
+&lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;time_evaluator&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;entry_name&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &l [...]
+&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;span class=&quot [...]
+&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;g secs/op, &lt;/span&gt;&lt;span class=&quot;si&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;g GFLOPS'&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;%&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt;&lt [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Now we can verify the results on PC,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_almost_equal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;decimal&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;In the case above, we develop and cross-compile to a binary file for our mobile phone. Through the proxy server, the binary is uploaded to the phone and run in its JVM. This approach makes it easy to develop and test different computation workloads on Android.&lt;/p&gt;
+
+&lt;h2 id=&quot;java-runtime-for-tvm&quot;&gt;Java Runtime for TVM&lt;/h2&gt;
+
+&lt;p&gt;The Android APP is built on top of the Java runtime, which provides minimal support for TVM Function and NDArray. Here’s an example of registering a function in tvm4j,&lt;/p&gt;
+
+&lt;div class=&quot;language-java highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nc&quot;&gt;Function&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;func&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;Function&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;convertFunc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt; [...]
+      &lt;span class=&quot;nd&quot;&gt;@Override&lt;/span&gt; &lt;span class=&quot;kd&quot;&gt;public&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;Object&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;invoke&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;TVMValue&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot [...]
+        &lt;span class=&quot;nc&quot;&gt;StringBuilder&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;new&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;StringBuilder&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;TVMValue&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;append&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;asString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;());&lt;/span&gt;
+        &lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;toString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+      &lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;o&quot;&gt;});&lt;/span&gt;
+&lt;span class=&quot;nc&quot;&gt;TVMValue&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushArg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;Hello&quot;&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushA [...]
+&lt;span class=&quot;n&quot;&gt;assertEquals&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;Hello World!&quot;&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;asString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;());&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As we have seen in the GEMM part, one can build a shared library with Python and execute it from Java,&lt;/p&gt;
+
+&lt;div class=&quot;language-java highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;ml.dmlc.tvm.Module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;ml.dmlc.tvm.NDArray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;ml.dmlc.tvm.TVMContext&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;java.io.File&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;java.util.Arrays&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+
+&lt;span class=&quot;kd&quot;&gt;public&lt;/span&gt; &lt;span class=&quot;kd&quot;&gt;class&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;LoadAddFunc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;kd&quot;&gt;public&lt;/span&gt; &lt;span class=&quot;kd&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;main&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;String&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;{&lt;/span&gt;
+    &lt;span class=&quot;nc&quot;&gt;String&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;loadingDir&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;];&lt;/span&gt;
+    &lt;span class=&quot;nc&quot;&gt;Module&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fadd&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;Module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;loadingDir&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;File&lt; [...]
+
+    &lt;span class=&quot;nc&quot;&gt;TVMContext&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;TVMContext&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;cpu&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+
+    &lt;span class=&quot;kt&quot;&gt;long&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;new&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;long&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;};&lt;/span&gt;
+    &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span& [...]
+    &lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;copyFrom&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;new&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+    &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span& [...]
+
+    &lt;span class=&quot;n&quot;&gt;fadd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;entryFunc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;().&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushArg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushArg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span& [...]
+    &lt;span class=&quot;nc&quot;&gt;System&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;println&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;Arrays&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;toString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&g [...]
+
+    &lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fadd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+  &lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Once you have built the TVM library following the &lt;a href=&quot;http://docs.tvmlang.org/how_to/install.html&quot;&gt;Installation Guide&lt;/a&gt;, run&lt;/p&gt;
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;make jvmpkg
+make jvminstall
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;This will compile, package and install tvm4j in your local maven repository. Please refer to &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/jvm&quot;&gt;tvm4j&lt;/a&gt; for more information.&lt;/p&gt;
+
+&lt;h2 id=&quot;remote-profile-and-test-on-iphoneipad&quot;&gt;Remote Profile and Test on iPhone/iPad&lt;/h2&gt;
+
+&lt;p&gt;Besides the Android RPC application, we also provide an &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/ios_rpc&quot;&gt;iOS RPC app&lt;/a&gt;, through which we can easily profile and test TVM computation workloads on an iPhone or iPad. It works almost the same way as on Android, except that Xcode and an iOS device are required.&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Bringing AMDGPUs to TVM Stack and NNVM Compiler with ROCm</title>
+   <link href="https://tvm.apache.org/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm"/>
+   <updated>2017-10-30T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm</id>
+   <content type="html">&lt;p style=&quot;text-align: center&quot;&gt;Aditya Atluri, Advanced Micro Devices, Inc.&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Masahiro Masuda, Ziosoft, Inc.&lt;/p&gt;
+
+&lt;p&gt;We are pleased to announce a new GPU backend for TVM stack - ROCm backend for AMD GPUs. If you are not familiar with TVM, you can refer to &lt;a href=&quot;http://tvmlang.org/2017/08/17/tvm-release-announcement.html&quot;&gt;the earlier announcement&lt;/a&gt; first. In short, TVM stack is an end to end compilation stack to deploy deep learning workloads to all hardware backends. Today’s announcement focuses on the code generator support for AMD GPUs. Specifically, we developed a [...]
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/tvm_rocm_overview.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM stack is developed by an open source community under the Apache-2.0 License. ROCm backend support was done with help from the community. Aditya first implemented the codegen and runtime. He was later joined by Masahiro. Masahiro’s full-time job is not related to TVM or AMD GPUs; nonetheless, TVM got him excited and he has been involved in fixing bugs, resolving all failing unit tests, and adding math function support to the codegen.&lt;/p&gt;
+
+&lt;h2 id=&quot;rocm-stack&quot;&gt;ROCm stack&lt;/h2&gt;
+
+&lt;p&gt;Radeon Open Compute is an open-source initiative by AMD to leverage the compute power of current and future generation GPUs. The ROCm software stack is a great tool to express and run the most commonly used GPU programming models and achieve peak performance. Not only is ROCm an open-source stack, it is an open stack, which means all the ISA and hardware features are well documented and programmable by developers. Developers can experiment with different programming models and try out multiple [...]
+
+&lt;p&gt;TVM leverages the open-source nature of the ROCm stack by using the LLVM AMDGPU backend code generator. TVM translates from its intermediate representation (IR) to the LLVM intermediate representation. This is where the open-source nature of the ROCm stack takes over. TVM’s LLVM AMDGPU CodeGen pass converts LLVM IR into GPU assembly and object code, which is later called to run the whole network, a group of layers, or a single layer.&lt;/p&gt;
+
+&lt;p&gt;On the ROCm stack, there is no virtual ISA; you get what you ask for, no less and no more. Hence, one can schedule operations in a kernel at the granularity of a single instruction, without worrying about instruction reordering and other optimizations you did not ask for.&lt;/p&gt;
+
+&lt;h2 id=&quot;using-nnvm-compiler-with-rocm-backend&quot;&gt;Using NNVM Compiler with ROCm backend&lt;/h2&gt;
+
+&lt;p&gt;Thanks to the TVM stack, we can directly compile models from popular deep learning frameworks such as MXNet and PyTorch into AMD GPU assembly using the NNVM compiler, today. With the ROCm backend, the generic workflow becomes as follows.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/rocm_workflow.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We have put together working examples of compiling models from MXNet and PyTorch with NNVM, and running them on AMD GPUs with ROCm backend. More frameworks are supported via the NNVM compiler stack. The repository is available &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;The script &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm/blob/master/mxnet_imagenet_inference.py&quot;&gt;mxnet_imagenet_inference.py&lt;/a&gt; demonstrates ImageNet inference on AMD GPUs with a recently introduced MXNet Gluon model. It does the following (a short sketch follows the list):&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Loads Resnet 50 model from &lt;a href=&quot;https://mxnet.incubator.apache.org/versions/master/api/python/gluon/model_zoo.html&quot;&gt;the Gluon model zoo&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Converts Gluon Resnet 50 model to NNVM graph format, using &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_mxnet (...)&lt;/code&gt;&lt;/li&gt;
+  &lt;li&gt;Compiles and executes the graph with ROCm backend&lt;/li&gt;
+&lt;/ul&gt;
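+
+&lt;p&gt;A minimal sketch of those steps (assuming MXNet and NNVM are installed and the pretrained weights can be downloaded; the input shape 1x3x224x224 is the usual one for ImageNet models):&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;import nnvm
+import nnvm.compiler
+from mxnet.gluon.model_zoo.vision import get_model
+
+# load the pretrained Gluon ResNet 50 model and convert it to an NNVM graph
+block = get_model('resnet50_v1', pretrained=True)
+sym, params = nnvm.frontend.from_mxnet(block)
+
+# compile the graph for the ROCm backend
+graph, lib, params = nnvm.compiler.build(
+    sym, target='rocm', shape={'data': (1, 3, 224, 224)}, params=params)&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;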
+
+&lt;p&gt;The example comes with an image of the following cat.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/cat.png&quot; alt=&quot;image&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Running our network, it predicts this image as “tiger cat”, among 1000 categories.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-plain&quot; data-lang=&quot;plain&quot;&gt;$ python mxnet_imagenet_inference.py
+Testing model resnet50_v1
+x (1, 3, 224, 224)
+TVM prediction top-1: 282 tiger cat&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;The script &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm/blob/master/advanced_superres_onnx.py&quot;&gt;advanced_superres_onnx.py&lt;/a&gt; gives an example of loading a model trained with PyTorch. The model is stored in the &lt;a href=&quot;https://onnx.ai/&quot;&gt;ONNX&lt;/a&gt; format. In this example, our network takes a low resolution image as input, and outputs a 4x high resolution image. We refer the details of the problem setup and the network archit [...]
+
+&lt;p&gt;In order to use models in the ONNX format with NNVM, we first use &lt;a href=&quot;https://github.com/onnx/onnx&quot;&gt;the ONNX library&lt;/a&gt; to load the ONNX model into a protocol buffer object. We can then use &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_onnx(...)&lt;/code&gt; to obtain an equivalent NNVM graph. With an NNVM graph in hand, we can follow the generic workflow of compilation and graph execution outlined above.&lt;/p&gt;
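+
+&lt;p&gt;A minimal sketch of the loading step (the file name &lt;code class=&quot;highlighter-rouge&quot;&gt;super_resolution.onnx&lt;/code&gt; is an assumption for illustration):&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;import onnx
+import nnvm
+
+# load the ONNX protobuf and convert it to an NNVM graph plus its parameters
+onnx_model = onnx.load('super_resolution.onnx')
+sym, params = nnvm.frontend.from_onnx(onnx_model)
+# from here, compile with nnvm.compiler.build(...) under target='rocm' as sketched earlier&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;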
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/butterfly.png&quot; alt=&quot;image&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The input to the network is the 64 x 64 image on the left, and it outputs the 256 x 256 image on the right. In the middle is a 256 x 256 image obtained simply by resizing the input image with bicubic interpolation. The network outputs an image of far better quality.&lt;/p&gt;
+
+&lt;p&gt;The input images are taken from the original paper, and they are available &lt;a href=&quot;https://twitter.app.box.com/s/lcue6vlrd01ljkdtdkhmfvk7vtjhetog&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h2 id=&quot;a-note-on-performance&quot;&gt;A Note on performance&lt;/h2&gt;
+
+&lt;p&gt;The current support on ROCm focuses on functionality coverage. We have already seen promising performance results by simply adopting existing TVM schedules for the CUDA backend. For example, you can try running &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/recipe/gemm/cuda_gemm_square.py&quot;&gt;the gemm test script&lt;/a&gt; in the TVM repository and see the result. For the two types of cards we tested, the current gemm recipe for square matrix multiplication (not  [...]
+This is already a promising start, as it is very hard to optimize performance to get to peak and we
+have not yet applied AMD GPU specific optimizations.
+We are starting to look at performance optimization and we expect more improvement to come.&lt;/p&gt;
+
+&lt;h2 id=&quot;walkthrough-of-rocm-backend&quot;&gt;Walkthrough of ROCm backend&lt;/h2&gt;
+
+&lt;p&gt;In the following part of this article we focus on explaining how to use the ROCm backend when working with TVM directly. All you need to do is build your TVM function under the target “rocm” and create a runtime context for it. Here, we show an example of ROCm backend usage, following ‘Vector Add Example’ in TVM’s &lt;a href=&quot;http://docs.tvmlang.org/tutorials/get_started.html#vector-add-example&quot;&gt;getting started tutorial&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;We start by setting up a compute operation and a schedule for the vector add kernel. This step is independent of a backend.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;__future__&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;absolute_import&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;print_function&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;n&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&q [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p& [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&qu [...]
+
+&lt;p&gt;Next, to use the ROCm backend we build our kernel under the “rocm” target. This will cause TVM to use our new code generator. We also need a runtime context for the ROCm backend.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;rocm&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;After building the kernel and setting up a runtime context, we can launch our vector add kernel.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+
+&lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt; &l [...]
+
+&lt;p&gt;We can view LLVM IR that TVM generates in the following way:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;You should see something like this:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-llvm&quot; data-lang=&quot;llvm&quot;&gt;&lt;span class=&quot;c1&quot;&gt;; ModuleID = 'myadd__kernel0'&lt;/span&gt;
+&lt;span class=&quot;err&quot;&gt;sour&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;datalayout&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64&quot;&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;triple&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;amdgcn-amd-amdhsa-hcc&quot;&lt;/span&gt;
+
+
+&lt;span class=&quot;c1&quot;&gt;; Function Attrs: nounwind&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_ker&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;ne&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;l&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k [...]
+&lt;span class=&quot;nl&quot;&gt;entry:&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workgroup.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workitem.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%6&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;-127&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%7&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;ashr&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%6&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;6&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%8&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;icmp&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;slt&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%7&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i1&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%8&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_then&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_else&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_then:&lt;/span&gt;                                          &lt;span class=&quot;c1&quot;&gt;; preds = %entry&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%9&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;shl&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;6&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end.sink.split&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_end.sink.split:&lt;/span&gt;                                &lt;span class=&quot;c1&quot;&gt;; preds = %if_else, %if_then&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;phi&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%21&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_else&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span [...]
+  &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%12&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%13&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%15&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;fadd&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%18&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%19&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;k&quot;&gt;store&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; [...]
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_end:&lt;/span&gt;                                           &lt;span class=&quot;c1&quot;&gt;; preds = %if_end.sink.split, %if_else&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;ret&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_else:&lt;/span&gt;                                          &lt;span class=&quot;c1&quot;&gt;; preds = %entry&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%20&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sub&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%21&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;shl&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;6&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%22&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;icmp&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;slt&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%21&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%20&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i1&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%22&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end.sink.split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;We can also view the GPU assembly that the ROCm backend generates. This is the actual code that runs on your GPU.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;asm&quot;&lt;/span&gt;&lt;sp [...]
+
+&lt;p&gt;The assembly should look something like this, omitting unnecessary details:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-plain&quot; data-lang=&quot;plain&quot;&gt;        s_load_dword s1, s[4:5], 0x18
+        v_mov_b32_e32 v2, -1
+        v_mov_b32_e32 v1, 0
+        s_waitcnt lgkmcnt(0)
+        s_add_i32 s0, s1, 0xffffff81
+        s_ashr_i32 s0, s0, 6
+        s_cmp_ge_i32 s6, s0
+        s_cbranch_scc0 BB0_2
+        v_sub_i32_e32 v1, vcc, s1, v0
+        s_lshl_b32 s0, s6, 6
+        v_cmp_lt_i32_e32 vcc, s0, v1
+        v_mov_b32_e32 v2, 0
+        v_cndmask_b32_e64 v1, 0, -1, vcc
+BB0_2:
+        v_cmp_ne_u32_e32 vcc, 0, v2
+        v_cndmask_b32_e64 v2, 0, 1, vcc
+        v_cmp_ne_u32_e32 vcc, 1, v2
+        s_and_b64 vcc, exec, vcc
+        s_cbranch_vccnz BB0_4
+        s_lshl_b32 s0, s6, 6
+        v_mov_b32_e32 v1, -1
+BB0_4:
+        v_cmp_ne_u32_e32 vcc, 0, v1
+        v_mov_b32_e32 v1, s0
+        s_and_saveexec_b64 s[0:1], vcc
+        s_xor_b64 s[0:1], exec, s[0:1]
+        s_cbranch_execz BB0_6
+BB0_5:
+        s_load_dwordx2 s[2:3], s[4:5], 0x0
+        s_load_dwordx2 s[6:7], s[4:5], 0x8
+        v_add_i32_e32 v0, vcc, v1, v0
+        s_load_dwordx2 s[4:5], s[4:5], 0x10
+        v_ashrrev_i32_e32 v1, 31, v0
+        v_lshlrev_b64 v[0:1], 2, v[0:1]
+        s_waitcnt lgkmcnt(0)
+        v_add_i32_e32 v2, vcc, s4, v0
+        v_mov_b32_e32 v3, s5
+        v_addc_u32_e32 v3, vcc, v3, v1, vcc
+        flat_load_dword v2, v[2:3]
+        v_add_i32_e32 v4, vcc, s6, v0
+        v_mov_b32_e32 v3, s7
+        v_addc_u32_e32 v5, vcc, v3, v1, vcc
+        flat_load_dword v4, v[4:5]
+        v_mov_b32_e32 v3, s3
+        v_add_i32_e32 v0, vcc, s2, v0
+        v_addc_u32_e32 v1, vcc, v3, v1, vcc
+        s_waitcnt vmcnt(0) lgkmcnt(0)
+        v_add_f32_e32 v2, v2, v4
+        flat_store_dword v[0:1], v2
+BB0_6:
+        s_or_b64 exec, exec, s[0:1]
+        s_endpgm&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;h2 id=&quot;links&quot;&gt;Links&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Github page of NNVM Compiler: &lt;a href=&quot;https://github.com/dmlc/nnvm&quot;&gt;https://github.com/dmlc/nnvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Github page of TVM: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Examples of ROCm backend with NNVM: &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm&quot;&gt;https://github.com/ROCmSoftwarePlatform/nnvm-rocm&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>NNVM Compiler: Open Compiler for AI Frameworks</title>
+   <link href="https://tvm.apache.org/2017/10/06/nnvm-compiler-announcement"/>
+   <updated>2017-10-06T08:30:00-07:00</updated>
+   <id>https://tvm.apache.org/2017/10/06/nnvm-compiler-announcement</id>
+   <content type="html">&lt;p style=&quot;text-align: center&quot;&gt;Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Amazon Web Service AI team&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;DMLC open-source community&lt;/p&gt;
+
+&lt;p&gt;Deep learning has become ubiquitous and indispensable. We are seeing a rising need for deploying deep learning workloads on many kinds of platforms such as mobile phones, GPUs, IoT devices and specialized accelerators. Last month, we announced the TVM stack to close the gap between deep learning frameworks and the performance- or efficiency-oriented hardware backends. The TVM stack makes it easy to build an end-to-end compilation flow for a deep learning framework. However, we think it wo [...]
+
+&lt;p&gt;Today, the UW Allen School and the AWS AI team, together with other contributors, are excited to announce the release of the NNVM compiler, an open deep learning compiler that compiles front-end framework workloads directly to hardware backends. We built it using the two-level intermediate representation (IR) in the TVM stack.
+The reader is welcome to refer to the &lt;a href=&quot;http://www.tvmlang.org/2017/08/17/tvm-release-announcement.html&quot;&gt;original TVM announcement&lt;/a&gt; for more technical details about the TVM stack. With the help of the TVM stack, the NNVM compiler can:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Represent and optimize common deep learning workloads in a high-level graph IR.&lt;/li&gt;
+  &lt;li&gt;Transform the computation graph to minimize memory utilization, optimize the data layout, and fuse computation patterns for different hardware backends.&lt;/li&gt;
+  &lt;li&gt;Present an end-to-end compilation pipeline from front-end deep learning frameworks to bare-metal hardware.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_compiler_stack.png&quot; alt=&quot;image&quot; width=&quot;612px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The NNVM compiler can directly take models from deep learning frameworks such as Apache MXNet.
+It also supports model exchange formats such as ONNX and CoreML. ONNX support enables NNVM to compile deep learning models from PyTorch, Caffe2 and CNTK.
+The CoreML frontend enables deployment of CoreML models to non-iOS devices.&lt;/p&gt;
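+
+&lt;p&gt;To give a feel for the workflow, here is a rough sketch of importing and compiling a model with the NNVM Python API (the MXNet symbol, its parameter dictionaries and the input shape below are placeholders, not values from this post):&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;# Hedged sketch of the NNVM-era Python API; sym, arg_params, aux_params and
+# the input shape are placeholders rather than values from this post.
+import nnvm.compiler
+import nnvm.frontend
+
+# convert an in-memory MXNet model into NNVM graph IR
+net, params = nnvm.frontend.from_mxnet(sym, arg_params, aux_params)
+
+# compile for a CUDA target; returns the graph, the compiled module and the parameters
+shape_dict = {&quot;data&quot;: (1, 3, 224, 224)}
+graph, lib, params = nnvm.compiler.build(
+    net, target=&quot;cuda&quot;, shape=shape_dict, params=params)&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;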
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_compiler_code.png&quot; alt=&quot;image&quot; width=&quot;712px&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;separation-of-optimization-and-deployment&quot;&gt;Separation of Optimization and Deployment&lt;/h2&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_deploy.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The NNVM compiler applies graph-level and tensor-level optimizations and optimizes them jointly to get the best performance. We take a different approach from existing deep learning frameworks, which package graph optimization together with the deployment runtime. The NNVM compiler adopts the conventional wisdom from compilers and separates optimization from the actual deployment runtime. This approach offers substantial optimization but still keeps the runtime lightweight. The compiled mo [...]
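+
+&lt;p&gt;As a rough sketch of the deployment side (assuming the compiled artifacts come from an earlier &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.compiler.build&lt;/code&gt; call; the input array and output shape are placeholders), running a compiled model with the lightweight graph runtime looks roughly like this:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;# Hedged sketch: graph, lib and params are assumed to come from nnvm.compiler.build;
+# input_array and out_shape are placeholders.
+import tvm
+from tvm.contrib import graph_runtime
+
+module = graph_runtime.create(graph, lib, tvm.gpu(0))
+module.set_input(**params)                            # load the compiled parameters
+module.set_input(&quot;data&quot;, tvm.nd.array(input_array))  # feed the input tensor
+module.run()
+out = module.get_output(0, tvm.nd.empty(out_shape))&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;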
+
+&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
+
+&lt;p&gt;The NNVM compiler is still under active development, and we can expect more improvements to come, but we have already started to see promising results.
+We benchmarked its performance and compared it against Apache MXNet on two typical hardware configurations: an ARM CPU on a Raspberry Pi and an Nvidia GPU on AWS. Despite the radical architectural difference between these two chips, we can use the same infrastructure and only need to change the schedule for each type of hardware.&lt;/p&gt;
+
+&lt;h3 id=&quot;nvidia-gpu&quot;&gt;Nvidia GPU&lt;/h3&gt;
+
+&lt;p&gt;GPU benchmarks and schedules were contributed by Leyuan Wang (AWS/UCDavis) and Yuwei Hu (TuSimple). We compared the NNVM compiler against Apache MXNet with CUDA8 and cuDNN7 as the backend on an Nvidia K80. This is a very strong baseline, as Apache MXNet turns on auto-tuning to select the best kernel from cuDNN. We also used the optimized depthwise kernel in MXNet to optimize the MobileNet workload.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_k80_result.png&quot; alt=&quot;image&quot; width=&quot;400px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;As can be seen, the NNVM compiler generates code that outperforms Apache MXNet on the K80. These improvements are due to the joint graph-level and kernel-level optimizations. It is worth noting that the NNVM compiler generates all the optimized GPU kernels on its own without relying on external libraries like cuDNN.&lt;/p&gt;
+
+&lt;h3 id=&quot;raspberry-pi-3b&quot;&gt;Raspberry Pi 3b&lt;/h3&gt;
+
+&lt;p&gt;The Raspberry Pi compilation stack was contributed by Ziheng Jiang (AWS/FDU).
+We compared the NNVM compiler against Apache MXNet with OpenBLAS and NNPACK.
+We explored the setups to get the best performance out of MXNet: we turned on the Winograd convolution in NNPACK for 3x3 convolutions, enabled multi-threading, and disabled the additional scheduler thread (so all threads are used by NNPACK).&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_rasp_result.png&quot; alt=&quot;image&quot; width=&quot;400px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;As can be seen, the code generated by the NNVM compiler is two times faster on ResNet18.
+The gap on MobileNet is mainly due to the lack of a depthwise convolution kernel in existing CPU DNN libraries; the NNVM compiler benefits from generating efficient ARM code directly.&lt;/p&gt;
+
+&lt;h2 id=&quot;acknowledgement&quot;&gt;Acknowledgement&lt;/h2&gt;
+&lt;p&gt;This project wouldn’t have been possible without our early contributors in the DMLC community.
+We would like to especially thank Yuwei Hu (TuSimple), Leyuan Wang (AWS/UCDavis), Joshua Z. Zhang (AWS)
+and Xingjian Shi (HKUST) for their early contributions to the project. We would also like to thank all the contributors
+to the TVM stack.&lt;/p&gt;
+
+&lt;p&gt;We also learned a lot from the following projects when building the NNVM compiler.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/Theano/Theano&quot;&gt;Theano&lt;/a&gt;: possibly the earliest compiler for deep learning&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/halide/Halide&quot;&gt;Halide&lt;/a&gt;: TVM uses &lt;a href=&quot;https://github.com/dmlc/HalideIR&quot;&gt;HalideIR&lt;/a&gt; as the data structure for
+arithmetic simplification and low-level lowering. HalideIR is derived from Halide.
+We also learned from Halide when implementing the lowering pipeline in TVM.&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/inducer/loopy&quot;&gt;Loopy&lt;/a&gt;: use of integer set analysis and its loop transformation primitives.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;links&quot;&gt;Links&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;Github page of NNVM Compiler: &lt;a href=&quot;https://github.com/dmlc/nnvm&quot;&gt;https://github.com/dmlc/nnvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Github page of TVM: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://news.cs.washington.edu/2017/10/06/allen-school-and-aws-team-up-on-new-nnvm-compiler-for-deep-learning-frameworks/&quot;&gt;UW Allen school blog about NNVM compiler&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://aws.amazon.com/blogs/ai/introducing-nnvm-compiler-a-new-open-end-to-end-compiler-for-ai-frameworks/&quot;&gt;AWS blogpost about NNVM compiler&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>Optimize Deep Learning GPU Operators with TVM: A Depthwise Convolution Example</title>
+   <link href="https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example"/>
+   <updated>2017-08-22T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example</id>
+   <content type="html">&lt;p&gt;Efficient deep learning operators are at the core of deep learning systems.
+Usually these operators are hard to optimize and require great effort from HPC experts.
+&lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;TVM&lt;/a&gt;, an end-to-end tensor IR/DSL stack, makes this much easier.&lt;/p&gt;
+
+&lt;p&gt;This blog teaches you how to write high-performance GPU operator kernels with the help of TVM.
+We use depthwise convolution (i.e. &lt;a href=&quot;http://docs.tvmlang.org/api/python/topi.html#topi.nn.depthwise_conv2d_nchw&quot;&gt;topi.nn.depthwise_conv2d_nchw&lt;/a&gt;) as an example,
+and demonstrate how we can improve over the already hand-optimized CUDA kernel in TensorFlow.
+Our final version is 2x-4x faster than the optimized kernel in tf-1.2 under different workloads, and 3x-7x faster with operator fusion enabled.
+Below is the result tested on GTX1080, with filter size = [1, 256, 3, 3], stride = [1, 1], padding = ‘SAME’:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/tf_compare.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;introduction-to-depthwise-convolution&quot;&gt;Introduction to Depthwise Convolution&lt;/h2&gt;
+
+&lt;p&gt;Depthwise convolution is an important building block of modern architectures, such as Xception [1] and MobileNet [2].
+It’s an effective method to reduce the computation complexity of deep neural networks.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/conv_and_depthconv.png&quot; alt=&quot;image&quot; width=&quot;80%&quot; /&gt;&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;source: &lt;a href=&quot;http://machinethink.net/blog/googles-mobile-net-architecture-on-iphone/&quot;&gt;http://machinethink.net/blog/googles-mobile-net-architecture-on-iphone/&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;In TVM, depthwise convolution can be declared as:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# padding stage
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;height_after_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;width_after_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+        &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;all&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt;  [...]
+        &lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;PaddedInput&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;# depthconv stage
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),& [...]
+&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; & [...]
+&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_channel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+        &lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;channel_multiplier&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/spa [...]
+        &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'DepthwiseConv2d'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;general-gpu-optimization-guidelines&quot;&gt;General GPU Optimization Guidelines&lt;/h2&gt;
+
+&lt;p&gt;This part briefly talks about three concepts we should know when optimizing CUDA code: data reuse, shared memory and bank conflicts.
+If you already know them, you may skip this part.&lt;/p&gt;
+
+&lt;h3 id=&quot;data-reuse&quot;&gt;Data Reuse&lt;/h3&gt;
+&lt;p&gt;In modern computing architectures, the cost of loading data from memory is much higher than doing a single floating point computation [3].
+Because of that, we always want to reuse the input data after they are loaded into registers or shared memory (cache).&lt;/p&gt;
+
+&lt;p&gt;There are two forms of data reuse in depthwise convolution: filter reuse and input reuse. Filter reuse happens as the filter slides over the input channel and is applied multiple times.
+Input reuse is realized through tiling; let’s take 3x3 depthwise convolution as an example:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/no_tiling.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Without tiling, each thread computes 1 output element and loads 3x3 input data. 16 threads together have 9x16 loads.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/tiling.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;With tiling, each thread computes 2x2 output elements and loads 4x4 input data. 4 threads together have 16x4 loads.&lt;/p&gt;
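+
+&lt;p&gt;These load counts are easy to verify with a quick back-of-the-envelope calculation. The small sketch below (an illustration assuming a 3x3 filter and a 4x4 output region; it is not part of the schedule code) recomputes them:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# count global-memory loads for a 4x4 output region and a 3x3 filter,
+# where each thread computes a tile_h x tile_w output tile
+def loads(out_h, out_w, tile_h, tile_w, k=3):
+    num_threads = (out_h // tile_h) * (out_w // tile_w)
+    loads_per_thread = (tile_h + k - 1) * (tile_w + k - 1)
+    return num_threads, num_threads * loads_per_thread
+
+print(loads(4, 4, 1, 1))  # (16, 144): 16 threads x 9 loads each, i.e. 9x16
+print(loads(4, 4, 2, 2))  # (4, 64):   4 threads x 16 loads each, i.e. 16x4
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;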
+
+&lt;h3 id=&quot;shared-memory-and-bank-conflicts&quot;&gt;Shared Memory and Bank Conflicts&lt;/h3&gt;
+&lt;p&gt;Shared memory can be seen as a cache in the GPU. It is on-chip and much faster than global memory.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/GPU_memory_hierarchy.png&quot; alt=&quot;image&quot; width=&quot;256px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Shared memory is allocated per block. It’s common practice to load data from global memory into shared memory, and then all threads in the block read data from shared memory.&lt;/p&gt;
+
+&lt;p&gt;The size of shared memory is limited (usually 48KB), so we must be careful not to overflow it.
+Besides, allocating too much shared memory to one block limits the number of active blocks per multiprocessor.&lt;/p&gt;
+
+&lt;p&gt;Another performance issue with shared memory is bank conflicts. Shared memory is divided into equally sized memory modules (banks) that can be accessed simultaneously;
+however, if multiple threads access the same memory bank (a bank conflict), the accesses are serialized, which decreases the effective bandwidth.&lt;/p&gt;
+
+&lt;p&gt;Shared memory banks are organized such that successive addresses are assigned to successive banks.
+To avoid bank conflicts, it’s better that successive threads access successive memory addresses, as illustrated below (each color represents one shared memory bank):&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/bank_conflicts.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;For more details on shared memory and bank conflicts, please refer to &lt;a href=&quot;https://devblogs.nvidia.com/parallelforall/using-shared-memory-cuda-cc/&quot;&gt;this Nvidia blog post&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;Ok, now let’s start optimizing depthwise convolution in TVM.&lt;/p&gt;
+
+&lt;h2 id=&quot;schedule-optimization&quot;&gt;Schedule Optimization&lt;/h2&gt;
+
+&lt;h3 id=&quot;compute-paddedinput-inline-to-save-memory-allocation&quot;&gt;Compute PaddedInput Inline to Save Memory Allocation&lt;/h3&gt;
+&lt;p&gt;As we see from part 1, padding is declared explicitly as a separate stage. We compute it inline to avoid redundant memory allocation:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h3 id=&quot;divide-one-large-channel-into-smaller-blocks&quot;&gt;Divide One Large Channel into Smaller Blocks&lt;/h3&gt;
+&lt;p&gt;One straightforward schedule for depthwise convolution is to have one CUDA block take care of one input channel and the corresponding filters, loading them into shared memory and then computing:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;IS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/spa [...]
+&lt;span class=&quot;n&quot;&gt;FS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;shared&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.y&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;# bind the dimension of batch (N in NCHW) with block_y
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;c1&quot;&gt;# bind the dimension of channel (C in NCHW) with block_x
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;s [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We test the average time cost of 1000 runs on GTX 1080, and compare with &lt;a href=&quot;https://www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution#depthwise_conv2d&quot;&gt;depthwise_conv2d in tensorflow&lt;/a&gt;.
+Here is the result:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Filter&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;stride&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;tf-1.2 SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 21, 21]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;16.1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;9.1&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;34.8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;14.5&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 64, 64]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;130.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;98.9&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;251.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;387.4&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;As we can see, this schedule performs well with small channel sizes like 21 x 21 or 32 x 32; however, its performance drops significantly as the channel size increases beyond 64 x 64.
+One main reason is that allocating too much shared memory to one block limits the number of active blocks per multiprocessor.&lt;/p&gt;
+
+&lt;p&gt;We modify the schedule to divide one large channel into smaller blocks. For example, one channel (64 x 64 or 96 x 96) is divided into blocks of 32 x 32,
+and one CUDA block takes care of one 32 x 32 block:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;blocking_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;blocking_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of height (H in NCHW)
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of width (W in NCHW)
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;c1&quot;&gt;# assign one 32 x 32 block to one cuda block
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt;&lt;span class=& [...]
+&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=& [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Here is the new result:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;[blocking_h, blocking_w]&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;tf-1.2 SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 64, 64]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;130.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;63.4&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;251.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;132.5&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Our blocking strategy works! For 64 x 64 channel size, it brings 1.6x acceleration (98.9us -&amp;gt; 63.4us); for 96 x 96 channel size, it brings 2.9x acceleration (387.4us -&amp;gt; 132.5us).&lt;/p&gt;
+
+&lt;h3 id=&quot;tuning-parameters-of-thread-numbers&quot;&gt;Tuning Parameters of Thread Numbers&lt;/h3&gt;
+
+&lt;p&gt;How should we schedule the workload, say 32x32, among the threads of one CUDA block? Intuitively, it should be like this:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class= [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;There are two parameters in the schedule: &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt;. How do we determine the optimal combination of the two?
+Well, let’s first do some experiments. Below is the result with Filter = [256, 1, 3, 3] and stride = [1, 1]:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Case&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_thread_y&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_thread_x&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;9.7&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;2&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8.8&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;3&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;17.7&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32.5&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;There are many interesting observations from the above results:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;
+    &lt;p&gt;Case 2 is faster than case 1. In case 2, each thread computes an 8x1 tile in the output, which corresponds to a 10x3 tile in the input.
+It has better data reuse than case 1’s 4x1 tile.&lt;/p&gt;
+  &lt;/li&gt;
+  &lt;li&gt;
+    &lt;p&gt;Case 3 is slower than case 2. It’s because in case 3 the workload per thread is too large, which leads to a high cost of local memory reads.&lt;/p&gt;
+  &lt;/li&gt;
+  &lt;li&gt;
+    &lt;p&gt;Case 4 is slower than case 3. It’s because &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x = 32&lt;/code&gt; in case 3 ensures no bank conflicts, while &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y = 32&lt;/code&gt; in case 4 doesn’t.&lt;/p&gt;
+  &lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;To summarize what we learn from the above observations:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;A large tile is good for data reuse, but not good for local memory reads.&lt;/li&gt;
+  &lt;li&gt;The influence of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; on bank conflicts is asymmetric.&lt;/li&gt;
+  &lt;li&gt;Finding the optimal combination of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; means striking a balance among efficient shared memory access (avoiding bank conflicts), data reuse, and local memory reads.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Pretty tricky. So, what exactly should we do to find the optimal combination? The answer is brute-force search. 
+We can pass &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; as arguments to the schedule function, and try all possible combinations to find the optimal one. This can be done easily in TVM:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot; [...]
+    &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;do_schedule_as_usual&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inf&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;all_possible_combinations&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;test_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;optimal_combination&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;In fact, it can be seen as a simple auto scheduler.&lt;/p&gt;
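+
+&lt;p&gt;To make this concrete, below is a minimal, self-contained sketch of such a grid search. It is only an illustration: &lt;code class=&quot;highlighter-rouge&quot;&gt;build_and_time&lt;/code&gt; is a hypothetical callable, not a TVM API, that is assumed to build the schedule with the given thread numbers, compile it, and return the measured time in microseconds.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import itertools
+
+def grid_search(build_and_time, candidates=(1, 2, 4, 8, 16, 32)):
+    # Try every (num_thread_y, num_thread_x) pair and keep the fastest one.
+    # build_and_time is a user-supplied callable that builds, compiles and
+    # times the schedule for one combination, returning the cost in us.
+    results = []
+    for ty, tx in itertools.product(candidates, repeat=2):
+        cost = build_and_time(num_thread_y=ty, num_thread_x=tx)
+        results.append(((ty, tx), cost))
+    return min(results, key=lambda r: r[1])
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;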
+
+&lt;h3 id=&quot;vthread-and-strided-patterns&quot;&gt;Vthread and Strided Patterns&lt;/h3&gt;
+&lt;p&gt;Vthread (virtual thread) in TVM is introduced to support strided patterns. We can use it this way:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;num_vthread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of height (H in NCHW) twice
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vyi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of width (W in NCHW) twice
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vxi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;c1&quot;&gt;# bind thread and vthread respectively
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt;&lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=& [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Let’s print the IR to see what vthread does:&lt;/p&gt;
+
+&lt;div class=&quot;language-c++ highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* Input = [1, 1, 32, 32], Filter = [1, 1, 3, 3], stride = [1, 1], padding = 'SAME' */&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.y, Range(min=0, extent=8), threadIdx.y)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.x, Range(min=0, extent=8), threadIdx.x)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)& [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;) [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;) [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;) [...]
+      &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+      &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Without vthread (just set it to 1), the IR is:&lt;/p&gt;
+
+&lt;div class=&quot;language-c++ highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* Input = [1, 1, 32, 32], Filter = [1, 1, 3, 3], stride = [1, 1], padding = 'SAME' */&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.y, Range(min=0, extent=8), threadIdx.y)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.x, Range(min=0, extent=8), threadIdx.x)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)& [...]
+      &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+      &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As we can see, when &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_y = 2&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_x = 2&lt;/code&gt;, the 32 x 32 channel is divided into four sub-channels of 16 x 16.
+Each thread computes four output elements at a time, one element in each sub-channel.&lt;/p&gt;
+
+&lt;p&gt;Below are the results with Filter = [256, 1, 3, 3], stride = [1, 1], blocking_h = 32, blocking_w = 32:&lt;/p&gt;
+
+&lt;style&gt;
+table th:nth-of-type(1) {
+    width: 120px;
+}
+table th:nth-of-type(2) {
+    width: 120px;
+}
+&lt;/style&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Case&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_thread_y, num_thread_x&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_vthread_y, num_vthread_x&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8, 8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;132.5&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;2&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8, 8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;103.1&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;3&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4, 32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;95.9&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8, 16&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 2&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;90.9&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Case 2 is faster than case 1. It’s because in case 2 &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x=8&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_x=4&lt;/code&gt; together ensure that consecutive threads access consecutive memory addresses,
+thus avoiding bank conflicts, as illustrated below (each color represents one thread’s workload):&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/vthread_and_strided_pattern.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;In theory, cases 3 and 4 should be equally fast, since they have the same workload per thread and both enjoy efficient shared memory access. In practice, case 4 is just a little faster.&lt;/p&gt;
+
+&lt;p&gt;Still remember TensorFlow’s speed? It’s 251.6 us, and now TVM is 2.8x faster. Looking at the progression 387.4 -&amp;gt; 132.5 -&amp;gt; 95.9 -&amp;gt; 90.9, blocking helps the most; tuning thread numbers saves another 37 us;
+vthread saves an additional 5 us.&lt;/p&gt;
+
+&lt;p&gt;In fact, TVM can be much faster than TensorFlow with a large kernel size or channel_multiplier (because of more filter reuse):&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Filter&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;stride&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;tf-1.2 SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;251.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;90.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;2.8x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 5, 5]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;597.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;128.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4.6x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 2, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;659.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;143.7&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4.6x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 2, 5, 5]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1203.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;170.5&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;7.1x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h2 id=&quot;operator-fusion&quot;&gt;Operator Fusion&lt;/h2&gt;
+
+&lt;p&gt;One typical optimization we can do in deep learning is operator fusion, which computes multiple operators together in a single kernel without saving intermediate results back to global memory.
+TVM supports this out of the box.&lt;/p&gt;
+
+&lt;p&gt;Consider a common pattern in neural networks: &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt;. We can fuse the three operators into one, by slightly modifying the original schedule:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;depthwise_c [...]
+&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;scale_shift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/s [...]
+&lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# is no longer DepthwiseConv2d
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# this line fuses ScaleShift, explicitly
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_scope&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;local&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&qu [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# schedule for Output the same way we schedule for DepthwiseConv2d as discussed above
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_at&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt; [...]
+&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;It generates IR like this:&lt;/p&gt;
+
+&lt;div class=&quot;language-c++ highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* Input = [1, 1, 32, 32], Filter = [1, 1, 3, 3], stride = [1, 1], padding = 'SAME' */&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [DepthwiseConv2d] storage_scope = &quot;local&quot;&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;allocate&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/s [...]
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.y, Range(min=0, extent=8), threadIdx.y)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.x, Range(min=0, extent=8), threadIdx.x)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+      &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &l [...]
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt [...]
+          &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+      &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+      &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt [...]
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As we can see, each thread computes &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt; before writing the result of &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; to global memory. The fused operator is as fast as a single &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;.
+Below are the results with Input = [1, 256, 96, 96], Filter = [256, 1, 3, 3], stride = [1, 1], padding = ‘SAME’:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;tf-1.2 &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;: 251.6 us&lt;/li&gt;
+  &lt;li&gt;tf-1.2 &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt; (separate): 419.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;: 90.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d + scale_shift + relu&lt;/code&gt; (fused): 91.5 us&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;The advantage of operator fusion is obvious.&lt;/p&gt;
+
+&lt;p&gt;This is not the end; TVM can do operator fusion in an even smarter way. You may refer to &lt;a href=&quot;https://github.com/dmlc/tvm/issues/215&quot;&gt;this issue&lt;/a&gt; and read the source code provided below.&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;Declare: &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/nn/convolution.py&quot;&gt;https://github.com/dmlc/tvm/blob/master/topi/python/topi/nn/convolution.py&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Schedule: &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/depthwise_conv2d.py&quot;&gt;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/depthwise_conv2d.py&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Test: &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/recipe/conv/depthwise_conv2d_test.py&quot;&gt;https://github.com/dmlc/tvm/blob/master/topi/recipe/conv/depthwise_conv2d_test.py&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;acknowledgements&quot;&gt;Acknowledgements&lt;/h2&gt;
+&lt;p&gt;The author thanks Tianqi Chen for his helpful advice and inspiring discussions.&lt;/p&gt;
+
+&lt;h2 id=&quot;bio&quot;&gt;Bio&lt;/h2&gt;
+&lt;p&gt;&lt;a href=&quot;https://Huyuwei.github.io&quot;&gt;Yuwei Hu&lt;/a&gt; is an intern in &lt;a href=&quot;http://tusimple.ai/&quot;&gt;Tusimple&lt;/a&gt;’s HPC group.
+He is spending a gap year there after obtaining a bachelor’s degree in electrical engineering from Beihang University.&lt;/p&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+&lt;p&gt;[1] &lt;a href=&quot;https://arxiv.org/abs/1610.02357&quot;&gt;Xception: Deep Learning with Depthwise Separable Convolutions&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[2] &lt;a href=&quot;https://arxiv.org/abs/1704.04861&quot;&gt;MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[3] &lt;a href=&quot;http://norvig.com/21-days.html#answers&quot;&gt;Approximate timing for various operations on a typical PC&lt;/a&gt;&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
+   <title>TVM: An End to End IR Stack for Deploying Deep Learning Workloads on Hardware Platforms</title>
+   <link href="https://tvm.apache.org/2017/08/17/tvm-release-announcement"/>
+   <updated>2017-08-17T12:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2017/08/17/tvm-release-announcement</id>
+   <content type="html">&lt;p style=&quot;text-align: center&quot;&gt;Tianqi Chen(project lead), Thierry Moreau(hardware stack), Ziheng Jiang†(graph compilation), Haichen Shen(gpu optimization)&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Advisors: Luis Ceze, Carlos Guestrin, Arvind Krishnamurthy&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;DMLC open-source community&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;†Amazon Web Service&lt;/p&gt;
+
+&lt;p&gt;Deep learning has become ubiquitous and indispensable. Part of this revolution has been fueled by scalable deep learning systems, such as TensorFlow, MXNet, Caffe and PyTorch. Most existing systems are optimized for a narrow range of server-class GPUs, and require significant effort to be deployed on other platforms such as mobile phones, IoT devices and specialized accelerators (FPGAs, ASICs). As the number of deep learning frameworks and hardware backends increases, we propose a  [...]
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/gap.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;We are excited to announce the launch of TVM as a solution to this problem. TVM is a novel framework that can:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Represent and optimize the common deep learning computation workloads for CPUs, GPUs and other specialized hardware&lt;/li&gt;
+  &lt;li&gt;Automatically transform the computation graph to minimize memory utilization, optimize data layout and fuse computation patterns&lt;/li&gt;
+  &lt;li&gt;Provide end-to-end compilation from existing front-end frameworks down to bare-metal hardware, and all the way up to browser-executable JavaScript.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;With the help of TVM, we can easily run deep learning workloads on mobile phones, embedded devices and even the browser with little additional effort. TVM also provides a unified optimization framework for deep learning workloads on a multitude of hardware platforms, including specialized accelerators that rely on novel computational primitives.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/end_to_end_stack.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We adopt a common philosophy from the compiler community and provide two intermediate representation layers to efficiently lower high-level deep learning algorithms down to a multitude of hardware back-ends.&lt;/p&gt;
+
+&lt;p&gt;In today’s release, we open-source the TVM package, which contains optimization primitives for x86, ARM, OpenCL, Metal, CUDA and JavaScript. We are actively working on adding support for specialized hardware acceleration and Nvidia’s GEMM-optimized Volta architecture.&lt;/p&gt;
+
+&lt;h2 id=&quot;technical-details&quot;&gt;Technical Details&lt;/h2&gt;
+
+&lt;p&gt;The goal of the TVM stack is to provide a reusable toolchain to compile high-level neural network descriptions from deep learning framework frontends down to low-level machine code for multiple hardware backends. Taking Apache MXNet as a front-end example, the following code snippet demonstrates how TVM can be used to compile a high-level description of a deep learning model to an optimized executable module tailored to the target hardware.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/code_highlevel.png&quot; alt=&quot;image&quot; width=&quot;800px&quot; /&gt;&lt;/p&gt;
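+
+&lt;p&gt;For readers of the plain-text version who cannot see the screenshot above, the flow it depicts looks roughly like the sketch below. This is only an illustration using the NNVM/TVM Python API names of that era, not a transcription of the image; the MXNet symbol, parameters and input data are assumed to be loaded already.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import nnvm.frontend
+import nnvm.compiler
+import tvm
+from tvm.contrib import graph_runtime
+
+# mx_sym, arg_params, aux_params, input_data: assumed to come from an MXNet model
+sym, params = nnvm.frontend.from_mxnet(mx_sym, arg_params, aux_params)
+graph, lib, params = nnvm.compiler.build(
+    sym, target='cuda', shape={'data': (1, 3, 224, 224)}, params=params)
+
+# Deploy with the lightweight graph runtime on the target device
+ctx = tvm.gpu(0)
+module = graph_runtime.create(graph, lib, ctx)
+module.set_input('data', tvm.nd.array(input_data, ctx), **params)
+module.run()
+out = module.get_output(0, tvm.nd.empty((1, 1000), 'float32', ctx))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;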
+
+&lt;p&gt;The challenge lies in enabling support for multiple hardware back-ends while keeping compute, memory and energy footprints at their lowest. We borrow wisdom from the compiler community in order to bridge the gap between the multitude of deep learning frameworks and hardware back-ends: we build a two-level intermediate layer composed of NNVM, a high-level intermediate representation (IR) for task scheduling and memory management, and TVM, an expressive low-level IR for optimizing [...]
+
+&lt;p&gt;The first level of the stack is a computational graph-based representation. A computation graph is a directed acyclic graph that represents computation as nodes and dataflow dependencies as edges. This representation is very powerful: it allows us to bake operation attributes into the computation graph and specify transformation rules to iteratively optimize a computation graph. This is a common approach taken by most of the existing deep learning frameworks, including the NNVM gra [...]
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/computational_graph.png&quot; alt=&quot;image&quot; width=&quot;300px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;A lot of powerful optimizations can be supported by the graph optimization framework. For example, we provided a sublinear memory optimization functionality that allows users to train a 1000-layer ImageNet ResNet on a single GPU.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/nnvm_gap.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;However, we find that the computational graph-based IR alone is not enough to solve the challenge of supporting different hardware backends. The reason is that a single graph operator like convolution or matrix multiplication may be mapped and optimized in very different ways for each hardware back-end. These hardware-specific optimizations can vary drastically in terms of memory layout, parallelization threading patterns, caching access patterns and choice of hardware primit [...]
+
+&lt;p&gt;We build a low-level representation to solve this problem. This representation is based on index formulas, with additional support for recurrence computation.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_dsl.png&quot; alt=&quot;image&quot; width=&quot;700px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The low-level IR adopts principles from existing image processing languages like Halide or Darkroom to formulate an expressive deep learning DSL. TVM builds low-level optimizations inspired by loop transformation tools like Loopy and polyhedra-based analysis. We also draw inspiration from the dataflow description languages used in deep learning frameworks like MXNet, TensorFlow and Theano. The algorithms described in TVM are then processed in a scheduling phase to apply transformati [...]
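+
+&lt;p&gt;To give a flavor of this DSL, here is a small sketch (using the 2017-era TVM Python API) that declares a vector addition by its index formula and then creates a default schedule for it; it is an illustrative example rather than the snippet shown in the figures.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import tvm
+
+n = tvm.var('n')
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+
+# The index formula: element i of C is A[i] + B[i]
+C = tvm.compute((n,), lambda i: A[i] + B[i], name='C')
+
+# Scheduling is a separate phase that decides how this computation is
+# mapped to loops, threads and memory scopes on a concrete back-end.
+s = tvm.create_schedule(C.op)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;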
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_backends.png&quot; alt=&quot;image&quot; width=&quot;600px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM includes standard transformation primitives commonly found in CPU optimization frameworks. More importantly, TVM incorporates novel optimization primitives targeted at GPUs, by exploiting thread cooperation patterns, data layout transformations and powerful new compute primitives. Using TVM in combination with NNVM provides a rich opportunity to optimize deep learning workloads across the software stack, enabling joint compute graph-level and operator-level optimizations.&l [...]
+
+&lt;h3 id=&quot;multi-language-and-platform-support&quot;&gt;Multi-language and Platform Support&lt;/h3&gt;
+
+&lt;p&gt;One of the many strengths of TVM lies in its rich support for multiple platforms and languages. We present two components of the framework: the compiler stack, which contains complete optimization libraries to produce optimized machine code, and the runtime, which is lightweight and offers the portability required to deploy the compiled modules on different platforms.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_flexible.png&quot; alt=&quot;image&quot; width=&quot;600px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM currently supports Python and C++ interfaces to the embedded compiler stack. We design the framework with maximum re-use in mind, so that compiler stack improvements can be applied interchangeably between the Python and C++ components.&lt;/p&gt;
+
+&lt;p&gt;We also provide a lightweight runtime that can directly run TVM-compiled code in languages such as JavaScript, Java, Python and C++ on platforms including Android, iOS, Raspberry Pi and web browsers.&lt;/p&gt;
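+
+&lt;p&gt;As a hedged sketch of that deployment flow with the 2017-era Python API: compile a module ahead of time, export it as a shared library, and load it back through the minimal runtime. The workload and file name below are made up for illustration.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+import tvm
+
+# Declare and schedule a tiny workload, then compile it for the local CPU.
+n = 1024
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute((n,), lambda i: A[i] + B[i], name='C')
+s = tvm.create_schedule(C.op)
+fadd = tvm.build(s, [A, B, C], 'llvm', name='myadd')
+
+# Export the compiled module; only the lightweight runtime is needed to load it.
+fadd.export_library('myadd.so')
+myadd = tvm.module.load('myadd.so')['myadd']
+
+ctx = tvm.cpu(0)
+a = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
+b = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
+c = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
+myadd(a, b, c)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;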
+
+&lt;h3 id=&quot;remote-deployment-and-execution&quot;&gt;Remote Deployment and Execution&lt;/h3&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_rpc.png&quot; alt=&quot;image&quot; width=&quot;500px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM supports cross-compilation for, and testing on, embedded devices with TVM RPC, a lightweight interface to deploy and execute TVM cross-compiled modules on a remote embedded device. This provides TVM users with a familiar high-level Python interface to compile, optimize and test deep learning algorithms remotely on various low-level embedded devices.&lt;/p&gt;
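+
+&lt;p&gt;A minimal sketch of what this looks like from the host side, assuming a TVM RPC server is already running on the device at the (made-up) address below and that a cross-compiled module &lt;code class=&quot;highlighter-rouge&quot;&gt;myadd.so&lt;/code&gt; is available locally. Depending on the TVM version, the RPC module is exposed as &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.rpc&lt;/code&gt; or &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.contrib.rpc&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+import tvm
+from tvm import rpc
+
+# Connect to a device that is already running the TVM RPC server.
+remote = rpc.connect('192.168.0.10', 9090)
+
+# Ship the cross-compiled module to the device and load it there.
+remote.upload('myadd.so')
+myadd = remote.load_module('myadd.so')['myadd']
+
+# Allocate arrays on the remote device and execute the kernel remotely.
+ctx = remote.cpu(0)
+a = tvm.nd.array(np.random.uniform(size=1024).astype('float32'), ctx)
+b = tvm.nd.array(np.random.uniform(size=1024).astype('float32'), ctx)
+c = tvm.nd.array(np.zeros(1024, dtype='float32'), ctx)
+myadd(a, b, c)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;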
+
+&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
+
+&lt;p&gt;TVM is still in an early stage of development and we can expect more improvements to come, but we have started to see very promising results, which are discussed in this section.&lt;/p&gt;
+
+&lt;p&gt;TVM gives us the flexibility to explore the rich optimization space of various deep learning kernels, for multiple hardware platforms. For instance, TVM allows us to tailor data layout and fused pattern requirements for the kernels and platforms that we most care about.  Please note that the baseline libraries are created for more general purpose problems, while TVM’s optimized kernels are heavily tuned for the workloads we evaluated via an auto-tuning process. TVM serves as a b [...]
+
+&lt;p&gt;The results listed in this section are still work in progress, and there is room for improvement.&lt;/p&gt;
+
+&lt;h3 id=&quot;raspberry-pi&quot;&gt;Raspberry Pi&lt;/h3&gt;
+
+&lt;p&gt;In the first part of the results, we compared the TVM CPU schedule to NNPACK on a Raspberry Pi 3B executing a ResNet workload. Due to limited time, we utilized TVM to implement the direct convolution, while NNPACK was used to perform Winograd convolution for 3x3 kernels.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/resnet_rasp.png&quot; alt=&quot;image&quot; width=&quot;500px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We find that with TVM’s auto-tuned kernels, we can obtain performance similar to the hand-optimized kernels in NNPACK for the Raspberry Pi experiments.&lt;/p&gt;
+
+&lt;h3 id=&quot;gpu-results&quot;&gt;GPU Results&lt;/h3&gt;
+&lt;p&gt;&lt;strong&gt;Author Credit&lt;/strong&gt; These benchmarks and the corresponding schedule optimizations were created by our contributors: &lt;a href=&quot;http://www.ece.ucdavis.edu/~laurawly/&quot;&gt;Leyuan Wang&lt;/a&gt; (AWS/UCDavis), &lt;a href=&quot;http://huyuwei.github.io&quot;&gt;Yuwei Hu&lt;/a&gt; (TuSimple) and Weitang Liu (AWS/UCDavis). They deserve all the credit.&lt;/p&gt;
+
+&lt;p&gt;As a proof of concept, we created an end-to-end compilation pipeline that can compile MXNet models down to TVM execution graphs. We apply optimizations within and between graph nodes by automatically fusing operators together and letting TVM generate the fused kernels.
+We benchmarked the MobileNet ImageNet workload and discuss the results below:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/gpu_mobilenet.png&quot; alt=&quot;image&quot; width=&quot;600px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We find that TVM outperforms our baseline method in terms of speed. More interestingly, kernel fusion brings additional speedup. It is worth mentioning that TVM generates all the optimized GPU kernels on its own, without relying on external libraries like cuDNN.&lt;/p&gt;
+
+&lt;p&gt;We are working on more experiments and will release new results as they are obtained.&lt;/p&gt;
+
+&lt;h2 id=&quot;open-source-effort&quot;&gt;Open Source Effort&lt;/h2&gt;
+&lt;p&gt;TVM started as a research project of the Paul G. Allen School of Computer Science and Engineering at the University of Washington. The TVM stack is designed to support &lt;a href=&quot;https://github.com/dmlc/dlpack&quot;&gt;DLPack&lt;/a&gt;, a consensus on tensor data structure by multiple major deep learning frameworks. We have received early contributions from UW, AWS, Qihoo 360, Facebook, HKUST, TuSimple, UCDavis, SJTU as well as members of the DMLC open-source community and DLPack initia [...]
+
+&lt;h2 id=&quot;acknowledgement&quot;&gt;Acknowledgement&lt;/h2&gt;
+&lt;p&gt;This project wouldn’t have been possible without our early contributors. We would like to thank Yizhi Liu (Qihoo 360), Yuwei Hu (TuSimple),
+Xingjian Shi (HKUST), Leyuan Wang (AWS/UCDavis), Nicolas Vasilache (Facebook), Jian Weng (UCLA), Weitang Liu (AWS/UCDavis), Edward Z. Yang (Facebook),
+Lianmin Zheng (SJTU), Qiao Zhang (UW), William Moses (Facebook/MIT) and Hu Shiwen. The author would also like to thank Xianyi Zhang (PerfXLab) for helpful discussions.&lt;/p&gt;
+
+&lt;p&gt;We also learnt a lot from the following projects when building TVM.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/halide/Halide&quot;&gt;Halide&lt;/a&gt;: TVM uses &lt;a href=&quot;https://github.com/dmlc/HalideIR&quot;&gt;HalideIR&lt;/a&gt; as the data structure for
+arithmetic simplification and low-level lowering. HalideIR is derived from Halide.
+We also learned from Halide when implementing the lowering pipeline in TVM.&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/inducer/loopy&quot;&gt;Loopy&lt;/a&gt;: use of integer set analysis and its loop transformation primitives.&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/Theano/Theano&quot;&gt;Theano&lt;/a&gt;: the design inspiration for the symbolic scan operator for recurrence.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;source-code&quot;&gt;Source code&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;The GitHub page can be found here: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;TVM is &lt;a href=&quot;https://github.com/dmlc/dlpack&quot;&gt;DLPack&lt;/a&gt; compatible, which makes it easy to support frameworks
+that adopt the standard, such as MXNet, PyTorch, Caffe2 and tiny-dnn.&lt;/li&gt;
+&lt;/ul&gt;
+</content>
+ </entry>
+ 
+ 
+</feed>
diff --git a/blog.html b/blog.html
index 9811424..4ed0041 100644
--- a/blog.html
+++ b/blog.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li class="active"><a href="https://tvm.ai/blog" class="active">Blog</a></li>
+      	<li class="active"><a href="https://tvm.apache.org/blog" class="active">Blog</a></li>
       	
       
     
diff --git a/about.html b/categories.html
similarity index 51%
copy from about.html
copy to categories.html
index 749691f..66fa3f4 100644
--- a/about.html
+++ b/categories.html
@@ -3,8 +3,8 @@
 <html lang="en">
   <head>
     <meta charset="utf-8">
-    <title>About</title>
-    <meta name="description" content="TVM">
+    <title>Categories</title>
+    
     <meta name="author" content="">
 
     <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/community">Community</a></li>
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li class="active"><a href="https://tvm.ai/about" class="active">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
@@ -135,26 +135,21 @@
     <div class="row">
       <div class="span14">
         
-<h1 id="about-tvm">About TVM</h1>
 
-<p>TVM is an open deep learning compiler stack for CPUs, GPUs, and specialized accelerators. It aims to close the gap between the productivity-focused deep learning frameworks,
-and the performance- or efficiency-oriented hardware backends. TVM provides the following main features:</p>
+<ul class="tag_box inline">
+  
+  
+
+
+  
+    
+  
+
 
-<ul>
-  <li>Compilation of deep learning models in Keras, MXNet, PyTorch, Tensorflow, CoreML, DarkNet into minimum deployable modules on diverse hardware backends.</li>
-  <li>Infrastructure to automatic generate and optimize tensor operators
-on more backend with better performance.</li>
 </ul>
 
-<p>TVM stack began as a research project at the <a href="https://sampl.cs.washington.edu/">SAMPL group</a> of
-Paul G. Allen School of Computer Science &amp; Engineering, University of Washington. The project is now driven by an open source community involving multiple industry and academic institutions.
-The project adopts <a href="https://docs.tvm.ai/contribute/community.html">Apache-style merit based governace model</a>.</p>
 
-<p>TVM provides two level optimizations show in the following figure.
-Computational graph optimization to perform tasks such as high-level operator fusion, layout transformation, and memory management.
-Then a tensor operator optimization and code generation layer that optimizes tensor operators. More details can be found at the <a href="https://arxiv.org/abs/1802.04799">techreport</a>.</p>
 
-<p style="text-align: center"><img src="/images/main/tvm-stack.png" alt="image" width="80%" /></p>
 
       </div>
     </div>
diff --git a/community.html b/community.html
index 7bac130..b8da3a3 100644
--- a/community.html
+++ b/community.html
@@ -13,8 +13,8 @@
     <![endif]-->
 
     <!-- Le styles -->
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
-    <link href="https://tvm.ai/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
 
     <!-- Le fav and touch icons -->
   <!-- Update these with your own images
@@ -84,7 +84,7 @@
     
       
       	
-      	<li class="active"><a href="https://tvm.ai/community" class="active">Community</a></li>
+      	<li class="active"><a href="https://tvm.apache.org/community" class="active">Community</a></li>
       	
       
       
@@ -93,7 +93,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/about">About</a></li>
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
       	
       
       
@@ -102,7 +102,7 @@
     
       
       	
-      	<li><a href="https://tvm.ai/vta">VTA</a></li>
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
       	
       
       
@@ -112,7 +112,7 @@
       
       
       	
-      	<li><a href="https://tvm.ai/blog">Blog</a></li>
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
       	
       
     
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..fe78307
--- /dev/null
+++ b/index.html
@@ -0,0 +1,159 @@
+
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>TVM Stack</title>
+    <meta name="description" content="TVM">
+    <meta name="author" content="">
+
+    <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
+    <!--[if lt IE 9]>
+      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
+    <![endif]-->
+
+    <!-- Le styles -->
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="https://tvm.apache.org/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+
+    <!-- Le fav and touch icons -->
+  <!-- Update these with your own images
+    <link rel="shortcut icon" href="images/logo/tvm-logo.png">
+  <link rel="shortcut icon" href="images/logo/tvm-logo.png">
+  -->
+  <link href="/images/logo/tvm-logo-square.png" rel="icon" type="image/png"/>
+  <!-- Global site tag (gtag.js) - Google Analytics -->
+  <script async src="https://www.googletagmanager.com/gtag/js?id=UA-75982049-2"></script>
+  <script>
+    window.dataLayer = window.dataLayer || [];
+    function gtag(){dataLayer.push(arguments);}
+
+    gtag('js', new Date());
+    gtag('config', 'UA-75982049-2');
+  </script>
+
+</head>
+
+  <body>
+    <div class="topbar">
+      <div class="fill">
+        <div class="container">
+          <h2 id="logo-wrap">
+            <a href="/" class="nav">
+              <img src="/images/logo/tvm-logo-small-black.png" width="100px">
+            </a>
+          </h2>
+          <ul class="nav" id="nav-bar">
+            
+            
+            
+
+
+
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="https://tvm.apache.org/community">Community</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="https://tvm.apache.org/about">About</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="https://tvm.apache.org/vta">VTA</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      
+      	
+      	<li><a href="https://tvm.apache.org/blog">Blog</a></li>
+      	
+      
+    
+  
+
+
+
+
+            <li> <a href="https://sampl.cs.washington.edu/tvmconf">TVM Conference</a></li>
+            <li> <a href="https://docs.tvm.ai/tutorials/">Tutorials</a></li>
+            <li> <a href="https://docs.tvm.ai">Docs</a></li>
+            <li> <a href="https://github.com/dmlc/tvm/">Github</a></li>
+          </ul>
+        </div>
+      </div>
+    </div>
+    
+
+<div id="tvm-banner-container">
+  <div id="tvm-banner-title">
+    <p>
+      End to End Deep Learning Compiler Stack
+    </p>
+    <div id="tvm-banner-subtitle">
+      <p>
+        for CPUs, GPUs and specialized accelerators
+      </p>
+    </div>
+    <p>
+      <a href="/about" class="link-btn" id="banner-btn">Learn More</a>
+    </p>
+  </div>
+
+</div>
+
+
+
+    
+
+
+
+  </body>
+</html>
+
diff --git a/rss.xml b/rss.xml
new file mode 100644
index 0000000..7fe7674
--- /dev/null
+++ b/rss.xml
@@ -0,0 +1,3606 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+<channel>
+        <title>TVM</title>
+        <description>TVM - </description>
+        <link>https://tvm.apache.org</link>
+        <atom:link href="https://tvm.apache.org" rel="self" type="application/rss+xml" />
+        <lastBuildDate>Mon, 25 Nov 2019 21:33:32 -0800</lastBuildDate>
+        <pubDate>Mon, 25 Nov 2019 21:33:32 -0800</pubDate>
+        <ttl>60</ttl>
+
+
+        <item>
+                <title>Integrating TVM into PyTorch</title>
+                <description>&lt;p&gt;As TVM continuously demonstrates improvements to the efficiency of deep learning execution,
+it has become clear that PyTorch stands to benefit from directly leveraging the compiler stack.
+A major tenet of PyTorch is providing seamless and robust integrations that don’t get in the user’s way.
+To that end, PyTorch now has an official TVM-based backend, &lt;a href=&quot;https://github.com/pytorch/tvm&quot;&gt;torch_tvm&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;Usage is simple:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
+torch_tvm.enable()
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;That’s it!  PyTorch will then attempt to convert all operators it can to known Relay operators during its JIT compilation process.&lt;/p&gt;
+
+&lt;h3 id=&quot;background&quot;&gt;Background&lt;/h3&gt;
+
+&lt;p&gt;Unlike many other ML frameworks, PyTorch exposes an eager-execution programming interface.  This style of programming avoids graph-based meta-programming and focuses on the direct manipulation of n-dimensional arrays (tensors) in a Pythonic way.  As such, the framework was initially well suited for the experimentation and development of models, but not for automatic performance optimization or deployment.  To leverage optimizing compiler techniques, some large changes were recen [...]
+
+&lt;p&gt;&lt;img src=&quot;https://i.imgur.com/4XVHbJE.png&quot; alt=&quot;TVM Integration&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;PyTorch 1.0 introduced PyTorch IR, a PyTorch-specific intermediate representation for models similar to Relay.  PyTorch programs can be converted into the IR via model tracing, which records the execution of a model, or via TorchScript, a subset of Python.  The new TVM backend lowers PyTorch IR to Relay, and is able to transparently improve PyTorch performance with little user involvement.&lt;/p&gt;
+
+&lt;h3 id=&quot;integration-and-results&quot;&gt;Integration and Results&lt;/h3&gt;
+
+&lt;p&gt;To support Relay, two features were added to the PyTorch JIT: custom transformation passes and custom subgraph interpreters.&lt;/p&gt;
+
+&lt;p&gt;When &lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; is enabled, subgraphs of PyTorch IR that can be converted to Relay &lt;code class=&quot;highlighter-rouge&quot;&gt;Expr&lt;/code&gt;s will be marked as Relay-compatible.  Since PyTorch IR does not always contain shape information, none of the subgraphs can be compiled in a useful way before invocation.&lt;/p&gt;
+
+&lt;p&gt;During user invocation, the PyTorch JIT runtime will determine input shape information and compile the previously marked subgraphs with the new Relay C++ &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/torch_tvm/compiler.cpp#L226-L246&quot;&gt;build system&lt;/a&gt;.  The compilation is cached based on input shapes for subsequent runs.  More details can be found in the &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/README.md&quot;&gt;README&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; has a continuous benchmark system set up, which monitors the performance of ResNet18 on CPU.
+Out of the box, TVM provides over two times the performance of the default PyTorch JIT backend for various ResNet models.
+Below is a graph that details the iterations per second achieved with 16 threads on an AWS c5n.4xlarge instance (larger is better):&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;https://i.imgur.com/KfJ7oas.png&quot; alt=&quot;bench&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;These results are quite encouraging, and the project will continue to focus on improving CPU inference speed across more models.&lt;/p&gt;
+
+&lt;h3 id=&quot;future-work&quot;&gt;Future work&lt;/h3&gt;
+
+&lt;p&gt;Right now the PyTorch JIT does a lot of work to find pure functional subsets of its IR to feed to Relay.  This avoids the need to map aliasing and control flow information to Relay, but is not necessary.  Mapping more of the PyTorch IR to Relay may yield performance wins and is a goal of the project.  PyTorch IR is rapidly changing as it is being developed, so this must be done carefully.&lt;/p&gt;
+
+&lt;p&gt;More work will be done to ensure the hand off between PyTorch and TVM code is efficient.  This includes unifying the threading model, allocators and reducing the overhead associated with copying inputs into TVM.&lt;/p&gt;
+
+&lt;h3 id=&quot;tutorial&quot;&gt;Tutorial&lt;/h3&gt;
+
+&lt;p&gt;If you have an already written PyTorch model, the easiest way to get started is to use &lt;code class=&quot;highlighter-rouge&quot;&gt;torch.jit.trace&lt;/code&gt; as follows.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
+from your_model import model, inputs
+
+torch_tvm.enable(opt_level=3)
+
+iters = 100
+warmup = 10
+
+# Ensure your model is in eval mode and also turn off gradients.
+with torch.no_grad():
+  # Use tuned parameters for better performance.
+  with autotvm.apply_history_best(&quot;test/autotvm_tuning.log&quot;):
+    # This is where all the compilation happens.
+    trace_tvm = torch.jit.trace(model, inputs)
+    
+    # Warmup
+    for _ in range(warmup):
+      _ = trace_tvm(*inputs)
+
+    # Benchmark
+    start = time.time()
+    for _ in range(iters):
+      _ = trace_tvm(*inputs)
+    tvm_time = time.time() - start
+    
+    print(&quot;Took {}s to run {} iters&quot;.format(tvm_time, iters))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Much of this code comes from &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/test/benchmarks.py&quot;&gt;benchmarks.py&lt;/a&gt;.  Note that tuned parameters for AVX2 LLVM compilation are in the &lt;code class=&quot;highlighter-rouge&quot;&gt;test/&lt;/code&gt; folder of the repo.&lt;/p&gt;
+
+&lt;p&gt;If you are more comfortable using Relay directly, it is possible to simply extract the expression directly from a
+PyTorch function either via (implicit) tracing or TorchScript:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;def add(a, b, c):
+    return a + b + c
+
+# via tracing
+relay_graph = torch_tvm.to_relay(add, inputs)
+
+@torch.jit.script
+def mul(a, b, c):
+    return a * b * c
+
+# via script
+relay_graph = torch_tvm.to_relay(mul, inputs)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+</description>
+                <link>https://tvm.apache.org/2019/05/30/pytorch-frontend</link>
+                <guid>https://tvm.apache.org/2019/05/30/pytorch-frontend</guid>
+                <pubDate>Thu, 30 May 2019 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Automating Optimization of Quantized Deep Learning Models on CUDA</title>
+                <description>&lt;p&gt;Deep learning has been successfully applied to a variety of tasks.
+In real-time scenarios such as inference on autonomous vehicles, the inference speed of the model is critical.
+Network quantization is an effective approach to accelerating deep learning models.
+In quantized models, both data and model parameters are represented with low precision data types such as &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;float16&lt;/code&gt;.
+The lowered data bandwidth reduces the inference time and memory/storage requirements, as well as the power consumption.
+Meanwhile, under proper quantization schemes, we can minimize the accuracy drops of the quantized models.
+Therefore, quantized models are of particular interest to researchers and developers, as quantization makes large models suitable for deployment on diverse devices such as GPUs, CPUs and mobile devices.&lt;/p&gt;
+
+&lt;p&gt;Previously, quantized operators were usually optimized with handcrafted microkernels for different workloads, or relied on blackbox proprietary solutions such as cuDNN and TensorRT.
+Writing a high-performance microkernel in assembly can be very challenging and usually requires heavy engineering effort.
+Besides, it is difficult to adapt these ad-hoc microkernels to emerging workloads and new devices.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/benchmark.svg&quot; alt=&quot;image&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 1. Inference time of different models on TVM, TensorRT, and MXNet &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM solves this challenge with a full stack compiler and a machine-learning-based optimizer to automatically generate computing kernels.
+TVM can generate efficient kernels via automatic search in a human-designed search space.
+In standard workloads such as VGG and ResNet, TVM achieves competitive performance compared with other state-of-the-art frameworks. 
+In emerging models such as ResNeXt and Deformable ConvNets, the automatic optimization makes it easy for TVM to adapt to these new workloads and achieve a significant performance boost.&lt;/p&gt;
+
+&lt;p&gt;In this post, we show how to use TVM to automatically optimize quantized deep learning models on CUDA.&lt;/p&gt;
+
+&lt;h1 id=&quot;expressing-quantized-cuda-kernels-in-tvm&quot;&gt;Expressing Quantized CUDA Kernels in TVM&lt;/h1&gt;
+&lt;h2 id=&quot;leveraging-tensor-intrinsics-via-tensorization&quot;&gt;Leveraging Tensor Intrinsics via Tensorization&lt;/h2&gt;
+&lt;p&gt;Many platforms provide architecture-specific instructions for special computation patterns, for example, the SIMD instructions on x86, and the &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;hfma&lt;/code&gt; instructions on CUDA.
+These intrinsic instructions are highly optimized for specific devices.
+By leveraging hardware intrinsics, we can achieve a significant performance boost for quantized operators.&lt;/p&gt;
+
+&lt;p&gt;Currently, &lt;a href=&quot;https://devblogs.nvidia.com/mixed-precision-programming-cuda-8/&quot;&gt;dp4a&lt;/a&gt; has been extensively used in TVM int8 operators on CUDA.
+&lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; is a CUDA intrinsic on Compute Capability 6.1 devices.
+It is a mixed-precision instruction that provides the efficient computation of the dot product between two 4-element 8-bit integer vectors and accumulates the result in 32-bit format.
+Using &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, we can implement a dot product between 8-bit integer vectors whose number of elements is evenly divisible by four.
+With an efficient dot product operator, we can implement high-level operators such as 2d convolution and dense layers as these operators are commonly backed by dot products.&lt;/p&gt;
+
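+&lt;p&gt;The decomposition can be sketched in plain NumPy (the &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; function below is only a Python stand-in for the hardware instruction, and the vector length is arbitrary): a dot product over a length divisible by four reduces to a chain of 4-element dot products accumulated in 32 bits.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+def dp4a(a4, b4, c):
+    # 4-element int8 dot product accumulated into int32,
+    # mirroring the semantics of the CUDA dp4a instruction.
+    return c + int(np.dot(a4.astype(np.int32), b4.astype(np.int32)))
+
+a = np.random.randint(-128, 128, size=16).astype(np.int8)
+b = np.random.randint(-128, 128, size=16).astype(np.int8)
+acc = 0
+for i in range(0, a.size, 4):  # reduce four elements at a time
+    acc = dp4a(a[i:i + 4], b[i:i + 4], acc)
+assert acc == int(np.dot(a.astype(np.int32), b.astype(np.int32)))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+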
+&lt;p&gt;To illustrate, in 2d convolution we accumulate along the channel, the width, and the height axis of the kernel.
+This is a typical use case of &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;.
+TVM uses tensorization to support calling external intrinsics.
+We do not need to modify the original computation declaration; we use the schedule primitive &lt;code class=&quot;highlighter-rouge&quot;&gt;tensorize&lt;/code&gt; to replace the accumulation with &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; tensor intrinsic.
+More details of tensorization can be found in the &lt;a href=&quot;https://docs.tvm.ai/tutorials/language/tensorize.html&quot;&gt;tutorial&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h2 id=&quot;data-layout-rearrangement&quot;&gt;Data Layout Rearrangement&lt;/h2&gt;
+&lt;p&gt;One of the challenges in tensorization is that we may need to design special computation logic to adapt to the requirement of tensor intrinsics.
+Although it is natural to accumulate along the inner axis of the tensor in the dense operator, &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; can be more challenging.
+In &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; we expect to take a slice in the channel dimension as the input of &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; because the number of channels is typically a multiple of 4 (otherwise we fall back to the original &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; in NCHW layout).
+Meanwhile, to achieve memory locality, we would like to reduce along the innermost axis first.
+Taking these factors into account, we use a custom data layout to address this challenge.&lt;/p&gt;
+
+&lt;p&gt;In CUDA int8 2d convolution, we empirically choose &lt;code class=&quot;highlighter-rouge&quot;&gt;NCHW4c&lt;/code&gt; as data layout and &lt;code class=&quot;highlighter-rouge&quot;&gt;OIHW4o4i&lt;/code&gt; as weight layout.
+The templates can also be easily generalized to &lt;code class=&quot;highlighter-rouge&quot;&gt;NCHW[x]c&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;OIHW[x]o[x]i&lt;/code&gt;, where x is an arbitrary positive integer divisible by four.
+In the data layout we choose, slices of channels are in the packed innermost dimension.
+Likewise, we pack slices in both the input and output channel dimensions of the weight so that the output has a consistent data layout with the input, which prevents redundant layout transformations between layers.&lt;/p&gt;
+
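+&lt;p&gt;As a small NumPy sketch of the packing (the shapes here are arbitrary and purely illustrative), converting a tensor from NCHW to NCHW4c splits the channel axis into groups of four and moves the 4-wide sub-channel axis to the innermost position.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+n, c, h, w = 1, 8, 5, 5
+x_nchw = np.arange(n * c * h * w).reshape(n, c, h, w)
+
+# NCHW -&gt; NCHW4c: split C into (C // 4, 4), then move the 4-wide
+# sub-channel axis to the innermost position.
+x_nchw4c = x_nchw.reshape(n, c // 4, 4, h, w).transpose(0, 1, 3, 4, 2)
+print(x_nchw4c.shape)  # (1, 2, 5, 5, 4)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+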
+&lt;p&gt;We show the computation of one element of the output of the 2d convolution in Figure 2.
+The element in each position of the super dimension (the outer dimension of the blocked layout which contains packed elements) NCHW and OIHW is the packed input and kernel, respectively.
+Each column of the packed kernel comes from a different filter.
+We calculate the dot product between the packed input and each row in the packed kernel using &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, and accumulate the result to the output tensor.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/conv2d.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+&lt;div&gt;
+Figure 2. 2D convolution with data layout in NCHW4c and weight layout in OIHW4o4i.
+&lt;b&gt;Left&lt;/b&gt;: The input tensor in NCHW4c layout. One moving filter of the kernel is colored in blue. One element of the input and kernel is colored in grey. 
+&lt;b&gt;Mid&lt;/b&gt;: The packed input and kernel in the grey block.
+&lt;b&gt;Right&lt;/b&gt;: The output in NCHW4c layout. Inside the one element depicted, there are four packed elements in channel sub-dimension.
+&lt;/div&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;After we have specified the layout of convolution layers, other operators such as &lt;code class=&quot;highlighter-rouge&quot;&gt;add&lt;/code&gt; and activations can automatically adapt to the chosen layout during the &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/src/relay/pass/alter_op_layout.cc&quot;&gt;AlterOpLayout&lt;/a&gt; pass in Relay.
+The layout transformation of the weight can be precomputed offline. Therefore, we can run the whole model in the same layout without extra overhead.&lt;/p&gt;
+
+&lt;h2 id=&quot;designing-search-space-for-automatic-optimization&quot;&gt;Designing Search Space for Automatic Optimization&lt;/h2&gt;
+&lt;p&gt;The key to achieving good performance in our quantized operators is to integrate with machine-learning-based automatic optimization. One question is how to design an effective schedule search space.
+An effective schedule template means that we can obtain good performance in a reasonable number of iterations in automatic tuning.
+Generally speaking, we strive to define a flexible template to cover different configurations in the search space.
+On the other hand, we also take advantage of the prior knowledge in performance optimization.
+For example, as caching data in the shared memory is a common practice in CUDA programming, we utilize shared memory, but we use machine learning to choose the best tile size.
+We also do some manual tiling such as splitting axes by 4 or 16 to facilitate vectorized memory access.&lt;/p&gt;
+
+&lt;p&gt;In quantized 2d convolution, we design a search space that includes a set of tunable options, such as the tile size, the axes to fuse, configurations of loop unrolling and double buffering.
+The templates of quantized &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;dense&lt;/code&gt; on CUDA are registered under template key &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt;.
+During automatic tuning, we can create tuning tasks for these quantized operators by setting the &lt;code class=&quot;highlighter-rouge&quot;&gt;template_key&lt;/code&gt; argument.
+Details of how to launch automatic optimization can be found in the &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_relay_cuda.html&quot;&gt;AutoTVM tutorial&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h1 id=&quot;general-workflow&quot;&gt;General Workflow&lt;/h1&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/workflow.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 3. Workflow of running quantized models &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM provides an easy workflow to quantize trained models from other frameworks, automatically optimize operators (with AutoTVM), and deploy to different devices.&lt;/p&gt;
+
+&lt;p&gt;First, we use the Relay frontend to import existing models. Here we use an MXNet model with &lt;code class=&quot;highlighter-rouge&quot;&gt;(1, 3, 224, 224)&lt;/code&gt; input shape as an example.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg_params&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;aux_params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&l [...]
+&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Next, we use the relay quantization API to convert it to a quantized model.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Then, we use AutoTVM to extract tuning tasks for the operators in the model and perform automatic optimization. The &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_relay_cuda.html&quot;&gt;AutoTVM tutorial&lt;/a&gt; provides an example for this.&lt;/p&gt;
+
+&lt;p&gt;Finally, we build the model and run inference in the quantized mode.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;opt_level&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt [...]
+    &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;s [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;The result of &lt;code class=&quot;highlighter-rouge&quot;&gt;relay.build&lt;/code&gt; is a deployable library.
+We can either run inference &lt;a href=&quot;https://docs.tvm.ai/tutorials/frontend/from_mxnet.html#execute-the-portable-graph-on-tvm&quot;&gt;on the GPU&lt;/a&gt; directly or deploy &lt;a href=&quot;https://docs.tvm.ai/tutorials/frontend/deploy_model_on_rasp.html#deploy-the-model-remotely-by-rpc&quot;&gt;on the remote devices&lt;/a&gt; via RPC.&lt;/p&gt;
+
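+&lt;p&gt;A minimal sketch of running it locally on the GPU, assuming the &lt;code class=&quot;highlighter-rouge&quot;&gt;graph&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;lib&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;params&lt;/code&gt; produced above and an input named &lt;code class=&quot;highlighter-rouge&quot;&gt;data&lt;/code&gt; (the input name is an assumption), looks like this.&lt;/p&gt;
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+import tvm
+from tvm.contrib import graph_runtime
+
+# graph, lib and params come from relay.build above.
+ctx = tvm.gpu(0)
+module = graph_runtime.create(graph, lib, ctx)
+module.set_input(**params)
+data = np.random.uniform(size=(1, 3, 224, 224)).astype(&quot;float32&quot;)
+module.set_input(&quot;data&quot;, data)
+module.run()
+out = module.get_output(0).asnumpy()
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+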
+&lt;h1 id=&quot;benchmark&quot;&gt;Benchmark&lt;/h1&gt;
+&lt;p&gt;To verify the performance of the quantized operators in TVM, we benchmark the performance of several popular network models including VGG-19, ResNet-50 and Inception V3.
+We also benchmark on DRN-C-26, ResNeXt-50, and DCN-ResNet-101 from &lt;a href=&quot;https://github.com/msracver/Deformable-ConvNets&quot;&gt;Deformable ConvNets&lt;/a&gt; to show the performance of emerging models, which contain less conventional operators such as dilated convolutions, group convolutions and deformable convolutions.
+We choose NVIDIA TensorRT as our baseline.
+The result of MXNet 1.4 + cuDNN 7.3 in float32 mode is reported to show the speedup from quantization.
+The experiments are conducted on NVIDIA GTX 1080.
+We report the inference time per image when running in batch size = 1 and 16.&lt;/p&gt;
+
+&lt;p&gt;As shown in the Figure 1, TVM achieves up to 8x speedup using quantization.
+In standard CNN models such as VGG and ResNet, TVM achieves parity with the state-of-the-art results from TensorRT.&lt;/p&gt;
+
+&lt;p&gt;When benchmarking emerging models, TVM achieves impressive results.
+We obtain significant performance gains on ResNeXt and DCN-ResNet-101.
+Results of DCN-ResNet-101 of TensorRT are not available because there is no official implementation of the deformable convolution.
+We show that automatic optimization in TVM makes it easy and flexible to support and optimize emerging workloads.&lt;/p&gt;
+
+&lt;h1 id=&quot;show-me-the-code&quot;&gt;Show Me the Code&lt;/h1&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/vinx13/tvm-cuda-int8-benchmark&quot;&gt;Benchmark&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/conv2d_int8.py&quot;&gt;CUDA int8 conv2d&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/group_conv2d_nchw.py&quot;&gt;CUDA int8 group conv2d&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/dense.py&quot;&gt;CUDA int8 dense&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/cuda/tensor_intrin.py&quot;&gt;Tensor intrinsics declaration&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;bio--acknowledgement&quot;&gt;Bio &amp;amp; Acknowledgement&lt;/h1&gt;
+&lt;p&gt;&lt;a href=&quot;https://wuwei.io/&quot;&gt;Wuwei Lin&lt;/a&gt; is an undergraduate student at SJTU. He is currently an intern at TuSimple. The author has many thanks to &lt;a href=&quot;https://homes.cs.washington.edu/~tqchen/&quot;&gt;Tianqi Chen&lt;/a&gt; and &lt;a href=&quot;https://homes.cs.washington.edu/~eqy/&quot;&gt;Eddie Yan&lt;/a&gt; for their reviews.&lt;/p&gt;
+</description>
+                <link>https://tvm.apache.org/2019/04/29/opt-cuda-quantized</link>
+                <guid>https://tvm.apache.org/2019/04/29/opt-cuda-quantized</guid>
+                <pubDate>Mon, 29 Apr 2019 09:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>TVM Deep Learning Compiler Joins Apache Software Foundation</title>
+                <description>&lt;p&gt;There is an increasing need to bring machine learning to a wide diversity of hardware devices. Current frameworks rely on vendor-specific operator libraries and optimize for a narrow range of server-class GPUs. Deploying workloads to new platforms – such as mobile phones, embedded devices, and accelerators (e.g., FPGAs, ASICs) – requires significant manual effort.&lt;/p&gt;
+
+&lt;p&gt;TVM is an open source deep learning compiler stack that closes the gap between the productivity-focused deep learning frameworks and the performance- or efficiency-oriented hardware backends. Today, we are glad to announce that the TVM community has decided to move to the Apache Incubator and become an Apache (incubating) project.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/main/tvm-stack.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM stack began as a research project at the &lt;a href=&quot;https://sampl.cs.washington.edu/&quot;&gt;SAMPL group&lt;/a&gt; of Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington. The project uses the loop-level IR and several optimizations from the &lt;a href=&quot;http://halide-lang.org/&quot;&gt;Halide project&lt;/a&gt;, in addition to &lt;a href=&quot;https://tvm.ai/about&quot;&gt;a full deep learning compiler stack&lt;/a&gt; to support [...]
+
+&lt;p&gt;Since its introduction, the project was driven by an open source community involving multiple industry and academic institutions. Currently, the TVM stack includes a high-level differentiable programming IR for high-level optimization, a machine learning driven program optimizer and VTA – a fully open sourced deep learning accelerator. The community brings innovations from machine learning, compiler systems, programming languages, and computer architecture to build a full-stack  [...]
+
+&lt;p&gt;Besides the technical innovations, the community adopts an open, welcoming and neutral policy. The project is run by committers who are elected purely based on their merit of the contributions to the project. Besides the contributors from UW SAMPL, the community now has nearly 200 contributors that come from Amazon Web Services (AWS), Qualcomm, Facebook, Google, Huawei, AMD, Microsoft, Cornell University, University of California, Berkeley, and more.        The community success [...]
+
+&lt;p&gt;We would like to take this chance to thank the Allen School for supporting the SAMPL team that gave birth to the TVM project. We would also like to thank the Halide project which provided the basis for TVM’s loop-level IR and initial code generation. We would like to thank our Apache incubator mentors for introducing the project to Apache and providing useful guidance. Finally, we would like to thank the TVM community and all of the organizations, as listed above, that supported [...]
+
+&lt;p&gt;See also the &lt;a href=&quot;https://news.cs.washington.edu/2019/03/18/allen-schools-tvm-deep-learning-compiler-framework-transitions-to-apache/&quot;&gt;Allen School news about the transition here&lt;/a&gt;, &lt;a href=&quot;https://sampl.cs.washington.edu/tvmconf/#about-tvmconf&quot;&gt;TVM conference program slides and recordings&lt;/a&gt;, and &lt;a href=&quot;https://docs.tvm.ai/contribute/community.html&quot;&gt;our community guideline here&lt;/a&gt;. Follow us on Twitter [...]
+</description>
+                <link>https://tvm.apache.org/2019/03/18/tvm-apache-announcement</link>
+                <guid>https://tvm.apache.org/2019/03/18/tvm-apache-announcement</guid>
+                <pubDate>Mon, 18 Mar 2019 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>TVM Golang Runtime for Deep Learning Deployment</title>
+                <description>&lt;h2 id=&quot;introduction&quot;&gt;Introduction&lt;/h2&gt;
+
+&lt;p&gt;TVM is an open deep learning compiler stack to compile various deep learning models from different
+frameworks to CPU, GPU or specialized accelerators.  TVM supports model compilation from a wide range
+of front ends like TensorFlow, ONNX, Keras, MXNet, Darknet, CoreML and Caffe2. TVM compiled modules
+can be deployed on backends like LLVM (JavaScript or WASM, AMD GPU, ARM or x86), NVIDIA GPU (CUDA),
+OpenCL and Metal.&lt;/p&gt;
+
+&lt;p&gt;TVM supports runtime bindings for programming languages like JavaScript, Java, Python, C++… and now Golang.
+With a wide range of frontend, backend and runtime bindings, TVM enables developers to integrate and
+deploy deep learning models from a variety of frameworks to a choice of hardware via many programming languages.&lt;/p&gt;
+
+&lt;p&gt;The TVM import and compilation process generates a graph JSON, a module and a params file. Any application that
+integrates the TVM runtime can load these compiled modules and perform inference. A detailed tutorial of module
+import and compilation using TVM can be found at &lt;a href=&quot;https://docs.tvm.ai/tutorials/&quot;&gt;tutorials&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;TVM now supports deploying compiled modules through Golang. Golang applications can make use of this
+to deploy deep learning models through TVM. The scope of this blog is an introduction to the &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package,
+the package build process and a sample application that uses &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; to load a compiled module and perform inference.&lt;/p&gt;
+
+&lt;h2 id=&quot;package&quot;&gt;Package&lt;/h2&gt;
+
+&lt;p&gt;The golang package &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; is built on top of TVM’s C runtime interface. The API in this package
+abstracts the native C types and provides Golang compatible types. The package source can be found
+at &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/golang&quot;&gt;gotvm&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;This package leverages golang’s interfaces, slices and function closures, and implicitly handles the
+necessary conversions across API calls.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/golang/TVM-Golang-Blog.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Golang Interface over TVM Runtime &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;how-to&quot;&gt;How to&lt;/h2&gt;
+
+&lt;p&gt;As shown in the diagram below, &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; enables golang applications to integrate deep learning models
+from various frameworks without the hassle of understanding each framework’s interface API.
+Developers can make use of TVM to import and compile deep learning models and generate TVM artifacts.
+The &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides a golang-friendly API to load, configure, feed input and get output.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/golang/TVM-Golang-Flow.png&quot; alt=&quot;image&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Import, Compile, Integrate and Deploy&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM &lt;a href=&quot;https://docs.tvm.ai/tutorials/#compile-deep-learning-models&quot;&gt;Compile Deep Learning Models&lt;/a&gt; tutorials
+are available to compile models from all frameworks supported by the TVM frontend. This compilation process
+generates the artifacts required to integrate and deploy the model on a target.&lt;/p&gt;
+
+&lt;h2 id=&quot;api&quot;&gt;API&lt;/h2&gt;
+
+&lt;p&gt;The &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides a handful of datatypes and API functions to initialize, load and run inference
+from a golang application. Like any other golang package, we just need to import the &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package here.&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Module : The Module API can be used to load a TVM compiled module into TVM runtime and access any functions.&lt;/li&gt;
+  &lt;li&gt;Value : The Value API provides helper functions to set arguments or get return values in golang types like basic types or slices.&lt;/li&gt;
+  &lt;li&gt;Function : The Function API is useful for getting handles to functions and invoking them.&lt;/li&gt;
+  &lt;li&gt;Array : The Array API is useful for setting and getting Tensor data via golang slice.&lt;/li&gt;
+  &lt;li&gt;Context : The Context API contains helper functions to build backend context handles.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;example&quot;&gt;Example&lt;/h2&gt;
+
+&lt;p&gt;A simple example, with inline documentation, of loading a compiled module and performing inference is shown below.
+For simplicity, error handling is ignored here, but it is important in real applications.&lt;/p&gt;
+
+&lt;div class=&quot;language-cpp highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;
+&lt;span class=&quot;n&quot;&gt;package&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;main&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;// Import compiled gotvm package.&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;s&quot;&gt;&quot;./gotvm&quot;&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;// Some constants for TVM compiled model paths.&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;// modLib : Is the compiled library exported out of compilation.&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;// modJson : TVM graph JSON.&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;// modParams : Exported params out of TVM compilation process.&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modLib&lt;/span&gt;    &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;./libdeploy.so&quot;&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modJSON&lt;/span&gt;   &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;./deploy.json&quot;&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modParams&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;./deploy.params&quot;&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;// main&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;main&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+    &lt;span class=&quot;c1&quot;&gt;// Some util API to query underlying TVM and DLPack version information.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Printf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;TVM Version   : v%v&lt;/span&gt;&lt;span class=&quot;se&quot;&gt;\n&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+    &lt;span class=&quot;n&quot;&gt;fmt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Printf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;DLPACK Version: v%v&lt;/span&gt;&lt;span class=&quot;se&quot;&gt;\n\n&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+
+    &lt;span class=&quot;c1&quot;&gt;// Import tvm module (so).&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;modp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;LoadModuleFromFile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;modLib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Load module on tvm runtime - call tvm.graph_runtime.create&lt;/span&gt;
+    &lt;span class=&quot;c1&quot;&gt;// with module and graph JSON.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ioutil&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ReadFile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;modJSON&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;jsonStr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;string&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetGlobalFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;tvm.graph_runtime.create&quot;&lt;/span&gt;&lt;span cla [...]
+    &lt;span class=&quot;n&quot;&gt;graphrt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;jsonStr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
+    &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphrt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;AsModule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+
+
+    &lt;span class=&quot;c1&quot;&gt;// Allocate input &amp;amp; output arrays and fill some data for input.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;tshapeIn&lt;/span&gt;  &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int64&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;224&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;224&lt;/span&gt;&lt [...]
+    &lt;span class=&quot;n&quot;&gt;tshapeOut&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int64&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1001&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;inX&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tshapeIn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+    &lt;span class=&quot;n&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tshapeOut&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;make&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;244&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;244&lt;/span&gt [...]
+    &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Seed&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;10&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Shuffle&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;len&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;sp [...]
+                                               &lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
+                                               &lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;})&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;inX&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;CopyFrom&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Load params&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ioutil&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ReadFile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;modParams&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;load_params&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bytes&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+
+    &lt;span class=&quot;c1&quot;&gt;// Set module input&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;set_input&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;input&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inX&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Run or Execute the graph&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;run&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt; [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Get output from runtime.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphmod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetFunction&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;get_output&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+    &lt;span class=&quot;n&quot;&gt;funp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Invoke&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int64&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// Access output tensor data.&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;outIntf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;AsSlice&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;outSlice&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;outIntf&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.([]&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;// outSlice here holds flattened output data as a golang slice.&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; extends the TVM packed function system to support golang function closures as packed functions.
+&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/sample&quot;&gt;Examples&lt;/a&gt; are available that register a golang
+closure as a TVM packed function and invoke it across the programming language barrier.&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/src&quot;&gt;Package Source&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/sample&quot;&gt;Examples&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;[1] &lt;a href=&quot;https://golang.org&quot;&gt;Go Programming Lang&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[2] &lt;a href=&quot;https://blog.golang.org/godoc-documenting-go-code&quot;&gt;Go Documentation Guide Lines&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[3] &lt;a href=&quot;https://golang.org/pkg/testing&quot;&gt;Go Testcase Framework&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[4] &lt;a href=&quot;https://golang.org/cmd/cgo&quot;&gt;Go CFFI&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[5] &lt;a href=&quot;https://blog.learngoprogramming.com/golang-variadic-funcs-how-to-patterns-369408f19085&quot;&gt;Go Variadic Functions&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[6] &lt;a href=&quot;https://github.com/jdeng/gomxnet&quot;&gt;CFFI Ref&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[7] &lt;a href=&quot;https://golang.org/pkg/runtime/#SetFinalizer&quot;&gt;Go Finalizers&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</description>
+                <link>https://tvm.apache.org/2019/01/19/Golang</link>
+                <guid>https://tvm.apache.org/2019/01/19/Golang</guid>
+                <pubDate>Sat, 19 Jan 2019 00:00:00 -0800</pubDate>
+        </item>
+
+        <item>
+                <title>Automating Generation of Low Precision Deep Learning Operators</title>
+                <description>&lt;p&gt;As deep learning models grow larger and more complex, deploying them on low-powered phone and IoT
+devices becomes challenging because of their limited compute and energy budgets. A recent trend
+in deep learning is the use of extremely quantized models that operate on inputs and
+weights of a few bits, with networks like XNOR-Net, DoReFa-Net, and HWGQ-Net making steady
+progress in improving accuracy.&lt;/p&gt;
+
+&lt;p&gt;An example of a low precision graph snippet is below. The low precision convolution takes in
+quantized data and bitpacks it into the proper data layout for an efficient bitserial convolution.
+The output is in a higher precision, and traditional deep learning layers such as batch normalization and ReLU are applied to it before it is re-quantized and sent through another low precision operator.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/workflow.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Low precision convolution pipeline.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;Theoretically, low precision operators use fewer operations than
+floating point operators, leading many to believe they can achieve tremendous speedups.
+However, deep learning frameworks leverage decades of engineering work through low-level
+BLAS and LAPACK libraries that are incredibly well optimized, and CPUs include intrinsic
+instructions to accelerate these tasks. In practice, it is not simple to develop low-level
+operators such as convolutions that are competitive with 8-bit quantized or even floating
+point operators.
+In this post we introduce our approach to automatically generating optimized
+low precision convolutions for CPUs. We declare our low precision operators so that they compute
+on efficiently stored low precision inputs, and define a schedule that describes a search space
+of implementation parameters. We rely on AutoTVM to quickly search the space and find optimized
+parameters for the particular convolution, precision, and backend.&lt;/p&gt;
+
+&lt;h2 id=&quot;bitserial-computation-background&quot;&gt;Bitserial Computation Background&lt;/h2&gt;
+
+&lt;p&gt;The core of low precision models is the bitserial dot product that enables convolution and
+dense operators to be computed using only bitwise operations and popcount.
+ Typically, a dot product is computed by element-wise multiplication of two vectors followed by
+ summing all the elements, like the simple example below. If all the data is binary, the input
+ vectors can be packed into a single integer, and the dot product can be computed by bitwise-anding
+ the packed inputs and counting the number of 1’s in the result using popcount.
+Note: Depending on how the input data is quantized, bitwise-xnor may be used instead of bitwise-and.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/binary-dotproduct.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Binary dot product.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
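+
+&lt;p&gt;As a rough illustration (a plain Python sketch, not the TVM implementation), the binary case boils down to packing each vector into an integer, bitwise-anding, and popcounting:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# Sketch only: binary dot product via bit packing and popcount.
+a = [1, 0, 1, 1, 0, 1, 0, 0]   # binary activations
+w = [1, 1, 1, 0, 0, 1, 1, 0]   # binary weights
+
+# Pack each vector into a single integer, one bit per element.
+pack = lambda bits: sum(b &lt;&lt; i for i, b in enumerate(bits))
+pa, pw = pack(a), pack(w)
+
+# bitwise-and keeps positions where both bits are 1; popcount sums them.
+dot = bin(pa &amp; pw).count('1')
+assert dot == sum(x * y for x, y in zip(a, w))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;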
+
+&lt;p&gt;Arbitrary precision dot products can be computed in this fashion by first separating input data
+into bitplanes. Once in this representation, we can compute the dot product by summing weighted binary
+dot products between the bitplanes of A and B. The number of binary dot products grows with the
+product of A and B’s precision, so this method is only practical for very low precision data.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/bitserial-dotproduct.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Bitserial dot product.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
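+
+&lt;p&gt;Continuing the sketch above (again purely illustrative), a 2-bit by 1-bit dot product decomposes into per-bitplane binary dot products weighted by powers of two:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# Sketch only: 2-bit activations (values 0-3) against 1-bit weights.
+a = [3, 1, 0, 2, 2, 1, 3, 0]
+w = [1, 0, 1, 1, 0, 1, 1, 0]
+
+pack = lambda bits: sum(b &lt;&lt; i for i, b in enumerate(bits))
+popcount = lambda x: bin(x).count('1')
+
+# Separate the activations into bitplanes, then weight each binary dot product by 2^plane.
+a_planes = [pack([(v &gt;&gt; p) &amp; 1 for v in a]) for p in range(2)]
+w_packed = pack(w)
+dot = sum((1 &lt;&lt; p) * popcount(plane &amp; w_packed) for p, plane in enumerate(a_planes))
+assert dot == sum(x * y for x, y in zip(a, w))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;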
+
+&lt;h2 id=&quot;defining-operators-in-tvm&quot;&gt;Defining Operators in TVM&lt;/h2&gt;
+&lt;p&gt;Before the computation, input data needs to be bitpacked so that the bitplanes of the input data
+can be accessed and are packed into a supported datatype such as uint8 or uint32. We provide
+a flexible bitpacking operator that takes arbitrary-size input tensors and returns a bitpacked
+tensor, with the user specifying along which axis the bitplanes should be placed.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/bitpack.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Different bitpacked layouts.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
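+
+&lt;p&gt;As a rough sketch of what such an operator does (the function name, layout, and shapes below are illustrative and do not match the exact TOPI signature), packing the last axis of a small uint8 tensor into bitplanes could look like this:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+def bitpack_sketch(x, bits, pack_dtype=np.uint32):
+    # Illustrative only: pack the last axis of x into bitplanes of pack_dtype.
+    lanes = np.dtype(pack_dtype).itemsize * 8
+    n = x.shape[-1]
+    assert n % lanes == 0, 'sketch assumes the packed axis is a multiple of the lane width'
+    out = np.zeros(x.shape[:-1] + (bits, n // lanes), dtype=pack_dtype)
+    for b in range(bits):                      # one bitplane per precision bit
+        plane = (x &gt;&gt; b) &amp; 1
+        for i in range(n):
+            out[..., b, i // lanes] |= plane[..., i].astype(pack_dtype) &lt;&lt; (i % lanes)
+    return out
+
+data = np.random.randint(0, 4, size=(1, 4, 4, 32), dtype=np.uint8)  # 2-bit values
+packed = bitpack_sketch(data, bits=2)          # shape (1, 4, 4, 2, 1)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;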
+
+&lt;p&gt;Once in this bitpacked format, the low precision convolution can be computed bitserially.
+For this demo, the data is packed along the input channel, the bitplanes are added to the
+innermost axis, and the data is packed into 32-bit integers. The bitserial convolution is computed
+similarly to a normal convolution, but bitwise-and (&amp;amp;) replaces multiplication, and we use
+popcount to accumulate values from the packed data. The bitplane axes become additional reduction axes
+that compute the binary dot products between different bitplanes of the input and kernel.
+Finally, the output is computed in an unpacked format and in higher precision.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;Input_bitpacked&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bitpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;acti [...]
+&lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bitpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pack_axis&lt;/span&gt;&lt;span class=&quot;o&quot;& [...]
+&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt [...]
+
+&lt;span class=&quot;n&quot;&gt;stride_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_down&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_right&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_pad_tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;( [...]
+
+&lt;span class=&quot;c1&quot;&gt;# Computing the output shape
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;out_channel&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_filter&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;simplify&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;in_height&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+ [...]
+&lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;simplify&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;in_width&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+& [...]
+&lt;span class=&quot;n&quot;&gt;pad_before&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;pad_after&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_down&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_right&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&l [...]
+&lt;span class=&quot;n&quot;&gt;Input_padded&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Input_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_before&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_after&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+
+&lt;span class=&quot;c1&quot;&gt;# Treat the bitplane axes like additional reduction axes
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&l [...]
+&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
+&lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
+&lt;span class=&quot;n&quot;&gt;ib&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt [...]
+&lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &l [...]
+
+
+&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; [...]
+             &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;popcount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+               &lt;span class=&quot;n&quot;&gt;Input_padded&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt; [...]
+               &lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ff&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
+               &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt;&lt;spa [...]
+
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;In our schedule we apply common optimizations like vectorization and memory tiling to provide better
+memory locality and take advantage of SIMD units. Some of these optimizations, such as tiling,
+require parameters that need to be tuned for the specific microarchitecture. We expose these
+parameters as knobs to TVM and use AutoTVM to automatically tune all the parameters simultaneously.&lt;/p&gt;
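+
+&lt;p&gt;For readers unfamiliar with AutoTVM templates, the pattern looks roughly like the sketch below, where a plain matmul stands in for the bitserial compute and the knob names are made up for illustration (the real bitserial schedules live in TOPI):&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import tvm
+from tvm import autotvm
+
+@autotvm.template
+def tunable_matmul(n, m, l):
+    # A plain matmul used as a stand-in for the bitserial compute.
+    A = tvm.placeholder((n, l), name='A')
+    B = tvm.placeholder((l, m), name='B')
+    k = tvm.reduce_axis((0, l), name='k')
+    C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
+    s = tvm.create_schedule(C.op)
+
+    cfg = autotvm.get_config()
+    y, x = s[C].op.axis
+    # Expose the tiling decisions as knobs instead of hard-coding them.
+    cfg.define_split('tile_y', y, num_outputs=2)
+    cfg.define_split('tile_x', x, num_outputs=2)
+    yo, yi = cfg['tile_y'].apply(s, C, y)
+    xo, xi = cfg['tile_x'].apply(s, C, x)
+    s[C].reorder(yo, xo, yi, xi)
+    s[C].vectorize(xi)
+    return s, [A, B, C]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;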
+
+&lt;p&gt;Finally, we can craft small microkernels to replace the innermost loop(s) of computation and schedule
+ them using TVM’s tensorize primitive. Since compilers often produce suboptimal code, people can
+ often write short assembly sequences that are more efficient. These microkernels often take advantage
+ of new intrinsics that are being introduced to help accelerate deep learning workloads, using
+ them in clever ways to improve memory accesses or reduce the number of instructions required.&lt;/p&gt;
+
+&lt;h2 id=&quot;results&quot;&gt;Results&lt;/h2&gt;
+
+&lt;h3 id=&quot;raspberry-pi&quot;&gt;Raspberry Pi&lt;/h3&gt;
+&lt;p&gt;Convolution speedups on Raspberry Pi 3B compared to a 16-bit integer TVM implementation.
+Workloads are convolution layers from ResNet18.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/rasp-conv.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Speedup of low precision convolutions on a Raspberry Pi compared to 16-bit TVM implementation.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;2-bit activation, 1-bit weight convolution speedups on Raspberry Pi 3B compared to the hand-optimized implementation from &lt;a href=&quot;https://arxiv.org/pdf/1712.02427.pdf&quot;&gt;High performance ultra-low-precision convolutions
+on mobile devices&lt;/a&gt;.
+Workloads are convolution layers from ResNet18.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/rasp-conv-2.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Speedup of 2-bit weight 1-bit activation Raspberry Pi convolutions against a hand optimized implementation.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;x86&quot;&gt;x86&lt;/h3&gt;
+
+&lt;p&gt;Convolution speedups on x86 compared to a 32-bit floating point TVM implementation.
+Note: the x86 microarchitecture we tested does not support a vectorized popcount instruction, so speedups are lower.&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/low-precision/x86-conv.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Speedup of x86 low precision convolutions compared to a 32-bit floating point TVM implementation.&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/nn/bitserial_conv2d.py&quot;&gt;TOPI bitserial convolution&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/python/topi/arm_cpu/bitserial_conv2d.py&quot;&gt;TOPI ARM cpu bitserial convolution&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;[1] &lt;a href=&quot;https://arxiv.org/abs/1810.11066&quot;&gt;Automating Generation of Low Precision Deep Learning Operators&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[2] &lt;a href=&quot;https://arxiv.org/abs/1603.05279&quot;&gt;XNOR-Net&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[3] &lt;a href=&quot;https://arxiv.org/abs/1702.00953&quot;&gt;HWGQ&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;[4] &lt;a href=&quot;https://arxiv.org/abs/1606.06160&quot;&gt;DoReFa&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+</description>
+                <link>https://tvm.apache.org/2018/12/18/lowprecision-conv</link>
+                <guid>https://tvm.apache.org/2018/12/18/lowprecision-conv</guid>
+                <pubDate>Tue, 18 Dec 2018 00:00:00 -0800</pubDate>
+        </item>
+
+        <item>
+                <title>Efficient Privacy-Preserving ML Using TVM</title>
+                <description>&lt;p&gt;This post describes Myelin, a framework for privacy-preserving machine learning in trusted hardware enclaves, and how TVM makes Myelin fast.
+The key idea is that TVM, unlike other popular ML frameworks, compiles models into lightweight, optimized, and dependency-free libraries which can fit into resource constrained enclaves.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/tvmfits.png&quot; alt=&quot;TVM fits in enclaves&quot; style=&quot;width: 80vw; max-width: 600px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Give creating a privacy-preserving ML model a try! Check out the &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/sgx&quot;&gt;example code&lt;/a&gt; available in the TVM repo.&lt;/p&gt;
+
+&lt;h1 id=&quot;motivation-privacy-preserving-ml&quot;&gt;Motivation: Privacy-Preserving ML&lt;/h1&gt;
+
+&lt;p&gt;Machine learning models benefit from large and diverse datasets.
+Unfortunately, using such datasets often requires trusting a centralized data aggregator or computation provider.
+For sensitive applications like healthcare and finance this is undesirable as it could compromise patient privacy or divulge trade secrets.
+Recent advances in secure and privacy-preserving computation, including &lt;em&gt;trusted execution environments&lt;/em&gt; and &lt;em&gt;differential privacy&lt;/em&gt;, offer a way for mutually distrusting parties to efficiently train a machine learning model without compromising the training data.
+We use TVM to make our privacy-preserving ML framework fast.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/sgx.png&quot; alt=&quot;Myelin workflow&quot; style=&quot;width: 80vw; max-width: 600px&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;use-cases&quot;&gt;Use Cases&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;private MLaaS&lt;/strong&gt;: a cloud provider runs their architecture on your data. You get the model outputs, your data stays private, and the cloud provider knows that you can’t steal the model.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;trustworthy ML competitions&lt;/strong&gt;: you train a model on contest data. The contest organizer sends private test data to your model and gets a verifiable report of accuracy. Your model stays safe until the organizer decides to purchase it. Other participants can’t cheat by training on test data.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;training on shared private data&lt;/strong&gt;: you (a researcher) want to train a model on several hospitals’ data. Directly sharing is too complicated. Instead, have a “trusted third party” train a privacy-preserving model.&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;http://www.vldb.org/pvldb/vol11/p2086-hynes.pdf&quot;&gt;&lt;strong&gt;ML on the Blockchain&lt;/strong&gt;&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;background&quot;&gt;Background&lt;/h1&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/dpnn.png&quot; alt=&quot;sketch of DP deep learning in a TEE&quot; style=&quot;width: 80vw; max-width: 400px&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;trusted-execution-environments&quot;&gt;Trusted Execution Environments&lt;/h2&gt;
+
+&lt;p&gt;A &lt;a href=&quot;https://en.wikipedia.org/wiki/Trusted_Computing#Remote_attestation&quot;&gt;trusted execution environment&lt;/a&gt; (TEE) essentially allows a remote user to provably run code on another person’s machine without revealing the computation to the hardware provider.&lt;/p&gt;
+
+&lt;p&gt;More technically, the TEE provides a secure &lt;em&gt;enclave&lt;/em&gt; of isolated/encrypted memory and CPU registers, as well as a trusted source of randomness.
+The TEE can also send a signed attestation of the code that’s loaded so that the remote user can verify that the enclave has been correctly loaded.
+This process, known as remote attestation, can be used to establish a secure communication channel into the enclave.
+The remote user can then provision it with secrets like private keys, model parameters, and training data.&lt;/p&gt;
+
+&lt;p&gt;Compared to pure crypto methods like &lt;a href=&quot;https://en.wikipedia.org/wiki/Garbled_circuit&quot;&gt;secure multi-party computation (MPC)&lt;/a&gt; and &lt;a href=&quot;https://en.wikipedia.org/wiki/Homomorphic_encryption#Fully_homomorphic_encryption&quot;&gt;fully-homomorphic encryption (FHE)&lt;/a&gt;, TEEs are several orders of magnitude faster and support general-purpose computation (i.e. not just arithmetic operations).
+Perhaps the only drawbacks are the additional trust assumptions in the hardware root of trust (a key burned into the processor) and loaded software.&lt;/p&gt;
+
+&lt;p&gt;Trust assumptions notwithstanding, TEE technology is becoming increasingly widespread and is playing a major role in practical privacy-preservation.
+In fact, general-purpose TEEs already exist in commodity hardware like &lt;a href=&quot;https://software.intel.com/en-us/sgx&quot;&gt;Intel SGX&lt;/a&gt; and &lt;a href=&quot;https://genode.org/documentation/articles/trustzone&quot;&gt;ARM TrustZone&lt;/a&gt;.
+Additionally, the fully-open source &lt;a href=&quot;https://keystone-enclave.org&quot;&gt;Keystone enclave&lt;/a&gt; is on the way.&lt;/p&gt;
+
+&lt;h2 id=&quot;differential-privacy&quot;&gt;Differential Privacy&lt;/h2&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/sgx/dp.png&quot; alt=&quot;DP as false positive/negative&quot; style=&quot;width: 80vw; max-width: 500px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://en.wikipedia.org/wiki/Differential_Privacy#Randomized_Response&quot;&gt;Differential privacy (DP)&lt;/a&gt; provides a formal guarantee that models trained on similar datasets are indistinguishable.
+Informally, a user’s privacy is not compromised by choosing to contribute data to a model.&lt;/p&gt;
+
+&lt;p&gt;In other words, given the output of an algorithm on two datasets which differ in only a single record, differential privacy upper bounds the probability that an adversary can determine which dataset was used.
+An algorithm may be made DP using a mechanism which adds noise to the algorithm’s output.
+The amount of noise is calibrated to how much the output depends on any particular input.
+In hypothesis-testing terms, if outcomes A and B each have probability 0.5, applying a DP mechanism is like convolving with a probability distribution: the privacy lies in the false positive and false negative rates.
+Since deep learning models tend to generalize well, the amount of noise is often less than might be expected.&lt;/p&gt;
+
+&lt;p&gt;Running a DP training algorithm in a TEE ensures that the DP mechanism is faithfully applied.&lt;/p&gt;
+
+&lt;h1 id=&quot;efficient-privacy-preserving-ml-using-tvm&quot;&gt;Efficient Privacy-Preserving ML Using TVM&lt;/h1&gt;
+
+&lt;p&gt;One of the primary challenges of working with a TEE is that the code running within does not have access to the untrusted OS.
+This means that the trusted software cannot create threads or perform I/O.
+Practically speaking, the result is that numerical libraries like OpenBLAS–much less frameworks like PyTorch and TensorFlow–cannot run directly in enclaves.&lt;/p&gt;
+
+&lt;p&gt;TEEs actually have a similar programming model to resource-constrained hardware accelerators.
+This is exactly what TVM is made for!
+In the privacy workflow, a user first defines an entire training graph in the high-level graph specification language.
+TVM then compiles the model and outputs a static library containing optimized numerical kernels which can easily be loaded into a TEE.
+Since the kernels are automatically generated and have strict bounds checking, they expose a small attack surface.
+They are supported by a lightweight, memory-safe Rust runtime which may also easily be reviewed for safety and correctness.&lt;/p&gt;
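+
+&lt;p&gt;As a minimal sketch (a toy kernel standing in for the compiled training graph, not the actual apps/sgx build script), compiling into a statically linkable system library could look like:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import tvm
+
+# A toy elementwise kernel; a real deployment would compile the whole training graph.
+n = tvm.var('n')
+A = tvm.placeholder((n,), name='A')
+B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
+s = tvm.create_schedule(B.op)
+
+# '--system-lib' bundles the generated kernels into a self-registering static
+# library, so no dynamic loading is needed inside the enclave.
+mod = tvm.build(s, [A, B], target='llvm --system-lib', name='add_one')
+mod.save('kernels.o')   # object file to link into the trusted binary
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;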
+
+&lt;p&gt;Of course, safety is most useful when practically applicable.
+Fortunately, TVM modules in enclaves have comparable performance to native CPU-based training.
+By coordinating threads using the untrusted runtime, a single TVM enclave can fully utilize the resources of its host machine.
+Moreover, it’s not difficult to imagine a secure parameter server which orchestrates entire datacenters of enclave-enabled machines.&lt;/p&gt;
+
+&lt;p&gt;TVM also provides opportunities for more subtle optimization of privacy-preserving algorithms.
+Indeed, its fine-grained scheduling features allow speedups when using differential privacy.
+For instance, the tightest DP bounds may be obtained by clipping the gradient of each training example and adding noise to each [1].
+In autograd frameworks, this requires forwarding the model for each example in the minibatch (though only one backward pass is needed) [2].
+Using TVM, however, per-example gradient clipping is straightforward: instead of scheduling each weight update as a single reduction over both batch and feature dimensions, the reduction is split into two.
+The reduction over features is followed by clipping and noising, and the result is then summed over examples to obtain the weight update.
+Thus, TVM allows applying differential privacy without introducing overhead greater than what is required by the technique.
+Also, if one really wants to get fancy, it’s possible to fuse the clipping and noising operations and apply them in-place to further trim down latency and memory usage.&lt;/p&gt;
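+
+&lt;p&gt;In NumPy pseudocode (purely illustrative of the mechanism, not of the TVM schedule itself, and with made-up shapes and noise parameters), the split looks like this:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+batch, n_params = 32, 1000
+clip_bound, noise_sigma = 1.0, 0.8
+
+# Hypothetical per-example gradients: one row per training example.
+per_example_grads = np.random.randn(batch, n_params)
+
+# Step 1: reduce over features only, to get each example's gradient norm.
+norms = np.linalg.norm(per_example_grads, axis=1, keepdims=True)
+
+# Step 2: clip each example's gradient, then draw calibrated Gaussian noise.
+clipped = per_example_grads / np.maximum(1.0, norms / clip_bound)
+noise = np.random.normal(0.0, noise_sigma * clip_bound, size=n_params)
+
+# Step 3: only now reduce over the batch dimension to form the weight update.
+update = clipped.sum(axis=0) + noise
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;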
+
+&lt;p&gt;For benchmarks on realistic workloads, please refer to the tech report &lt;a href=&quot;https://arxiv.org/abs/1807.06689&quot;&gt;&lt;em&gt;Efficient Deep Learning on Multi-Source Private Data&lt;/em&gt;&lt;/a&gt;.
+And, of course, feel free to give the framework a spin in the &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/sgx&quot;&gt;TVM SGX example&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h1 id=&quot;conclusion&quot;&gt;Conclusion&lt;/h1&gt;
+
+&lt;p&gt;The next generation of learning systems will be ushered in by privacy.
+As TEE technology becomes better understood and more widely available, it makes sense to leverage it as a resource for privacy-preserving machine learning and analytics.
+TVM is well poised to facilitate development of this use case in both research and deployment.&lt;/p&gt;
+
+&lt;h1 id=&quot;bio--acknowledgement&quot;&gt;Bio &amp;amp; Acknowledgement&lt;/h1&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://github.com/nhynes&quot;&gt;Nick&lt;/a&gt; is a PhD student in Prof. Dawn Song’s lab at UC Berkeley.
+His research interest is in the general domain of ML on shared private data, but this is really just an excuse to mess with Rust, security monitors, hardware enclaves, and compilers like TVM.&lt;/p&gt;
+
+&lt;p&gt;Thanks to Tianqi Chen for the code reviews!&lt;/p&gt;
+
+&lt;h1 id=&quot;references&quot;&gt;References&lt;/h1&gt;
+
+&lt;p&gt;[1] &lt;a href=&quot;https://arxiv.org/abs/1607.00133&quot;&gt;Deep Learning with Differential Privacy&lt;/a&gt;&lt;br /&gt;
+[2] &lt;a href=&quot;https://arxiv.org/pdf/1510.01799v2.pdf&quot;&gt;Efficient Per-Example Gradient Computations&lt;/a&gt;&lt;/p&gt;
+</description>
+                <link>https://tvm.apache.org/2018/10/09/ml-in-tees</link>
+                <guid>https://tvm.apache.org/2018/10/09/ml-in-tees</guid>
+                <pubDate>Tue, 09 Oct 2018 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Automatic Kernel Optimization for Deep Learning on All Hardware Platforms</title>
+                <description>&lt;p&gt;Optimizing the performance of deep neural networks on a diverse range of hardware platforms is still a hard
+problem for AI developers. In terms of system support, we are facing a many-to-many problem here:
+deploying trained models from multiple frontends (e.g. Tensorflow, ONNX, MXNet) to multiple
+hardware platforms (e.g. CPU, GPU, Accelerators). The most performance critical part of
+this problem is obtaining high performance kernel implementations for growing model
+architectures and hardware platforms.&lt;/p&gt;
+
+&lt;p&gt;To address this challenge, TVM takes a full stack compiler approach.
+TVM combines code generation and automatic program optimization to generate kernels
+that are comparable to heavily hand-optimized libraries,
+obtaining state-of-the-art inference performance on hardware platforms including
+ARM CPUs, Intel CPUs, Mali GPUs, NVIDIA GPUs, and AMD GPUs.&lt;/p&gt;
+
+&lt;p&gt;In this blog post, we show the workflow of automatic kernel optimization in the TVM compiler stack and
+benchmark results on several hardware platforms.&lt;/p&gt;
+
+&lt;h1 id=&quot;system-overview&quot;&gt;System Overview&lt;/h1&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/autotune-all/overview.png&quot; alt=&quot;image&quot; width=&quot;35%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 1. System Overview &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;Kernel optimization in TVM is done in an iterative loop fashion.
+As shown in Figure 1, the automatic kernel optimization takes a neural network (typically in computational graph representation)
+from frontend frameworks as input, and generates kernels for all operators in this network.&lt;/p&gt;
+
+&lt;p&gt;The inner loop uses a scalable RPC runtime, machine learning based tuners, and a tensor compiler.
+In each round of the loop, the tuner picks a batch of promising candidate kernel implementations from a large search space,
+and profiles them on real hardware. The profiling results are then used as training
+data to fit a prediction model. After fitting the prediction model, the tuner picks the next promising candidates according to the predictions,
+and the loop continues. This way, we search for fast kernels iteratively.&lt;/p&gt;
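+
+&lt;p&gt;The inner loop corresponds roughly to the following sketch; the device key, tracker address, log file name, and trial count are placeholders, and the full setup is covered in the tutorials linked at the end of this post.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;from tvm import autotvm
+
+# Build candidates locally, run them on real devices through the RPC tracker.
+measure_option = autotvm.measure_option(
+    builder=autotvm.LocalBuilder(),
+    runner=autotvm.RPCRunner('rk3399', host='localhost', port=9190,
+                             number=5, timeout=10))
+
+for task in tasks:                          # tasks extracted from the network
+    tuner = autotvm.tuner.XGBTuner(task)    # ML cost model picks the next candidates
+    tuner.tune(n_trial=1000,
+               measure_option=measure_option,
+               callbacks=[autotvm.callback.log_to_file('tuning.log')])
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;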
+
+&lt;p&gt;The figure below compares traditional auto-tuning and AutoTVM.
+The major difference is that AutoTVM is&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;Scalable&lt;/strong&gt; to heterogeneous clusters of devices&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;Learning&lt;/strong&gt; to optimize tensor programs with a transferable machine learning cost model&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;You can refer to our paper [1] for more details.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/autotune-all/autotvm.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+&lt;center&gt; Figure 2. Comparison of Traditional Auto-tuning and AutoTVM &lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;begin-tuning&quot;&gt;Begin Tuning&lt;/h2&gt;
+&lt;p&gt;For demonstration, we run our optimization for resnet-18 on RK3399, an ARM development board.
+The detailed instructions are omitted due to the space limit of a blog post.
+Links to tutorials for ARM CPU, Mali GPU, NVIDIA GPU, and AMD GPU are all available at the end of this blog.&lt;/p&gt;
+
+&lt;p&gt;First we get a pre-trained model from MXNet model zoo, and extract tuning tasks from it.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;mxnet.gluon.model_zoo.vision&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_model&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;block&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_model&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'resnet18_v1'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pretrained&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/ [...]
+&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nnvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;frontend&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt [...]
+
+&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;autotvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;extract_from_graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;tune_tasks&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tuning_option&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;There are 12 different conv2d layers in resnet-18, so we launch 12 tuning tasks.
+For each of them, the tuner makes several hundred trials and picks the best one.
+After finishing all tuning tasks, we compile the whole network and generate a single deployable minimal library.
+One sample output is&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Extract tasks...
+Tuning...
+[Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
+[Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
+[Task  3/12]  Current/Best:    4.67/  24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
+[Task  4/12]  Current/Best:   11.35/  46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
+[Task  5/12]  Current/Best:    1.01/  19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
+[Task  6/12]  Current/Best:    2.47/  23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
+[Task  7/12]  Current/Best:   14.57/  33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
+[Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+[Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+[Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+[Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+[Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
+Compile...
+Upload...
+Evaluate inference time cost...
+Mean inference time (std dev): 162.59 ms (0.06 ms)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
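+
+&lt;p&gt;The compile and upload steps in the log above correspond roughly to the sketch below; the log file name, target string, and input shape are placeholders, and &lt;code class=&quot;highlighter-rouge&quot;&gt;net&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;params&lt;/code&gt; come from the snippet above. See the tutorials linked at the end for the exact version used in the benchmark.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import nnvm.compiler
+import tvm
+from tvm import autotvm
+
+target = tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu')
+input_shape = (1, 3, 224, 224)
+
+# Apply the best configs found during tuning, then build the whole network.
+with autotvm.apply_history_best('tuning.log'):
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, shape={'data': input_shape}, params=params)
+
+# Export a single minimal library that the RPC runtime can upload and run.
+lib.export_library('net.tar')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;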
+
+&lt;p&gt;The tuning is especially helpful and worth a try if your model has some strange shapes or
+your hardware is customized, as hand-optimized static libraries cannot consider all situations.&lt;/p&gt;
+
+&lt;h1 id=&quot;benchmark-results&quot;&gt;Benchmark Results&lt;/h1&gt;
+&lt;p&gt;We pre-tuned some popular networks on our device cluster and released the following benchmark.
+Instructions for reproduction are at the end of this blog.&lt;/p&gt;
+
+&lt;p&gt;Comprehensively benchmarking TVM is easy since we have a unified runtime interface.
+However, maintaining complete, up-to-date, and correct comparisons against all other platforms is not feasible
+without expert assistance from the developers of many other projects.
+So we put all our numbers in a table, and then provide an incomplete comparison with some other libraries.&lt;/p&gt;
+
+&lt;h2 id=&quot;comparison&quot;&gt;Comparison&lt;/h2&gt;
+&lt;p&gt;We validate the effectiveness of our automatic optimization stack by 
+comparing with heavily optimized traditional libraries on each platform.&lt;/p&gt;
+
+&lt;p&gt;We tested popular image classification networks on ImageNet (3x224x224) dataset with batch size = 1 and data type = float32.
+The reported numbers are time costs per image in milliseconds.&lt;/p&gt;
+
+&lt;h3 id=&quot;arm-cpu&quot;&gt;ARM CPU&lt;/h3&gt;
+
+&lt;p&gt;We choose &lt;a href=&quot;https://github.com/Tencent/ncnn&quot;&gt;NCNN&lt;/a&gt;, a widely used, hand-optimized kernel library, as the baseline.
+It makes extensive use of NEON assembly instructions. For example, the code base contains
+&lt;a href=&quot;https://github.com/Tencent/ncnn/blob/master/src/layer/arm/convolution_3x3.h&quot;&gt;13k lines of code&lt;/a&gt; for only 3x3 convolution layers.
+We reference the benchmark numbers in their project repository.
+As shown in the figure below, TVM outperforms it for all networks on Raspberry Pi 3B.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/arm.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;mali-gpu&quot;&gt;Mali GPU&lt;/h3&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://github.com/ARM-software/ComputeLibrary&quot;&gt;ARM Compute Library&lt;/a&gt; is a vendor-provided library that supports Mali GPU (OpenCL) well.
+According to the results, TVM provides stronger performance in ResNet and MobileNet due to advantages in convolutional layers.
+TVM lags behind a bit on vgg-16 because vgg-16 is an old and huge network and has several large dense layers.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/mali.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;nvidia-gpu&quot;&gt;NVIDIA GPU&lt;/h3&gt;
+
+&lt;p&gt;On NVIDIA GPU, &lt;a href=&quot;https://developer.nvidia.com/cudnn&quot;&gt;CuDNN&lt;/a&gt; and &lt;a href=&quot;https://developer.nvidia.com/tensorrt&quot;&gt;TensorRT&lt;/a&gt; are two vendor-provided libraries for training and inference respectively. Since we focus on inference,
+we run our benchmark in the unbatched setting. Another tensor compiler, &lt;a href=&quot;https://github.com/plaidml/plaidml&quot;&gt;PlaidML&lt;/a&gt;, is also reported as a baseline
+because there is a previous benchmark comparing it against a pre-AutoTVM version of TVM.
+We reference its benchmark results from &lt;a href=&quot;https://github.com/plaidml/plaidbench&quot;&gt;PlaidBench&lt;/a&gt;.
+According to the results below, TVM achieves parity with TensorRT performance.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/nvidia.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;amd-gpu&quot;&gt;AMD GPU&lt;/h3&gt;
+
+&lt;p&gt;We also take a quick look at an AMD GPU. TVM supports OpenCL and &lt;a href=&quot;https://rocm.github.io/&quot;&gt;ROCm&lt;/a&gt; backends. We found ROCm is better since
+it is more specialized for AMD GPUs.
+&lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/MIOpen&quot;&gt;MIOpen&lt;/a&gt; is a vendor-provided
+kernel library. TVM’s graph runtime can call MIOpen’s kernel implementations directly, so we report
+the baseline performance by using this integration.&lt;/p&gt;
+
+&lt;p&gt;We didn’t do any specific optimization for AMD GPU. All computation definition and schedule code for NVIDIA GPU is directly reused.
+As a result, TVM is a little bit slower than MIOpen in most cases.
+We believe there is still room for improvement.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/autotune-all/amd.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;all-our-results&quot;&gt;All Our Results&lt;/h2&gt;
+&lt;p&gt;We tested the following networks on ImageNet (3x224x224) dataset with batch size = 1 and data type = float32.
+The reported numbers are time costs per image in milliseconds.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+      &lt;th&gt; &lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;densenet121&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;inception v3&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;mobilenet&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;mobilenet v2&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;resnet18&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;resnet50&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;squeezenet v1.0&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;squeezenet v1.1&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;vgg16&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;vgg19&lt;/strong&gt;&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;ARM CPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Huawei P20 Pro&lt;/td&gt;
+      &lt;td&gt;181.4&lt;/td&gt;
+      &lt;td&gt;439.9&lt;/td&gt;
+      &lt;td&gt;41.1&lt;/td&gt;
+      &lt;td&gt;34.5&lt;/td&gt;
+      &lt;td&gt;76.5&lt;/td&gt;
+      &lt;td&gt;208.2&lt;/td&gt;
+      &lt;td&gt;51.8&lt;/td&gt;
+      &lt;td&gt;25.7&lt;/td&gt;
+      &lt;td&gt;480.6&lt;/td&gt;
+      &lt;td&gt;627.0&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Google Pixel2&lt;/td&gt;
+      &lt;td&gt;162.2&lt;/td&gt;
+      &lt;td&gt;433.5&lt;/td&gt;
+      &lt;td&gt;39.5&lt;/td&gt;
+      &lt;td&gt;30.1&lt;/td&gt;
+      &lt;td&gt;61.1&lt;/td&gt;
+      &lt;td&gt;181.3&lt;/td&gt;
+      &lt;td&gt;47.3&lt;/td&gt;
+      &lt;td&gt;23.2&lt;/td&gt;
+      &lt;td&gt;391.1&lt;/td&gt;
+      &lt;td&gt;487.7&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Firefly RK3399&lt;/td&gt;
+      &lt;td&gt;335.9&lt;/td&gt;
+      &lt;td&gt;1285.9&lt;/td&gt;
+      &lt;td&gt;78.6&lt;/td&gt;
+      &lt;td&gt;66.7&lt;/td&gt;
+      &lt;td&gt;161.2&lt;/td&gt;
+      &lt;td&gt;403.8&lt;/td&gt;
+      &lt;td&gt;94.6&lt;/td&gt;
+      &lt;td&gt;48.5&lt;/td&gt;
+      &lt;td&gt;902.9&lt;/td&gt;
+      &lt;td&gt;1090.1&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Raspberry Pi 3B&lt;/td&gt;
+      &lt;td&gt;609.5&lt;/td&gt;
+      &lt;td&gt;2070.4&lt;/td&gt;
+      &lt;td&gt;122.2&lt;/td&gt;
+      &lt;td&gt;103.7&lt;/td&gt;
+      &lt;td&gt;322.5&lt;/td&gt;
+      &lt;td&gt;725.8&lt;/td&gt;
+      &lt;td&gt;185.1&lt;/td&gt;
+      &lt;td&gt;94.1&lt;/td&gt;
+      &lt;td&gt;1759.6&lt;/td&gt;
+      &lt;td&gt;2118.6&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Xilinx PYNQ&lt;/td&gt;
+      &lt;td&gt;2888.3&lt;/td&gt;
+      &lt;td&gt;9709.1&lt;/td&gt;
+      &lt;td&gt;723.5&lt;/td&gt;
+      &lt;td&gt;514.3&lt;/td&gt;
+      &lt;td&gt;1234.6&lt;/td&gt;
+      &lt;td&gt;3580.5&lt;/td&gt;
+      &lt;td&gt;909.9&lt;/td&gt;
+      &lt;td&gt;477.3&lt;/td&gt;
+      &lt;td&gt;&lt;sup&gt;-(Note 1)&lt;/sup&gt;&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;Mali GPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Mali-T860&lt;/td&gt;
+      &lt;td&gt;410.9&lt;/td&gt;
+      &lt;td&gt;783.1&lt;/td&gt;
+      &lt;td&gt;75.4&lt;/td&gt;
+      &lt;td&gt;70.8&lt;/td&gt;
+      &lt;td&gt;128.6&lt;/td&gt;
+      &lt;td&gt;352.9&lt;/td&gt;
+      &lt;td&gt;106.2&lt;/td&gt;
+      &lt;td&gt;58.0&lt;/td&gt;
+      &lt;td&gt;679.5&lt;/td&gt;
+      &lt;td&gt;805.3&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;NVIDIA GPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GTX 1080 Ti&lt;/td&gt;
+      &lt;td&gt;3.6&lt;/td&gt;
+      &lt;td&gt;5.8&lt;/td&gt;
+      &lt;td&gt;0.6&lt;/td&gt;
+      &lt;td&gt;- &lt;sup&gt;(Note 2) &lt;/sup&gt;&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;2.7&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;4.0&lt;/td&gt;
+      &lt;td&gt;4.6&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GTX TITAN X&lt;/td&gt;
+      &lt;td&gt;5.8&lt;/td&gt;
+      &lt;td&gt;9.7&lt;/td&gt;
+      &lt;td&gt;1.0&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;4.3&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;6.4&lt;/td&gt;
+      &lt;td&gt;7.5&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;&lt;strong&gt;AMD GPU&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+      &lt;td&gt; &lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;AMD Vega FE&lt;/td&gt;
+      &lt;td&gt;5.7&lt;/td&gt;
+      &lt;td&gt;8.8&lt;/td&gt;
+      &lt;td&gt;1.0&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;4.5&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;-&lt;/td&gt;
+      &lt;td&gt;5.9&lt;/td&gt;
+      &lt;td&gt;7.0&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Note 1: Out of memory on this board.&lt;/li&gt;
+  &lt;li&gt;Note 2: We didn’t tune some small networks on GPU due to time constraints.
+When profiling data is not available, TVM can use fallback code generation. 
+But competitive performance is not guaranteed in this scenario.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;conclusion&quot;&gt;Conclusion&lt;/h1&gt;
+&lt;p&gt;With an expressive code generator and an efficient search algorithm, we are able to
+generate kernels that are comparable to heavily hand-optimized ones.
+Since programmer time is expensive and machine time is getting cheaper,
+we believe automatic optimization with real hardware and data in the loop will be the standard workflow
+for inference deployment. TVM provides exactly such a solution.&lt;/p&gt;
+
+&lt;h2 id=&quot;links&quot;&gt;Links&lt;/h2&gt;
+&lt;p&gt;[1] benchmark: &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/benchmark&quot;&gt;https://github.com/dmlc/tvm/tree/master/apps/benchmark&lt;/a&gt;&lt;br /&gt;
+[2] Tutorial on tuning for ARM CPU: &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html&quot;&gt;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html&lt;/a&gt;&lt;br /&gt;
+[3] Tutorial on tuning for Mobile GPU: &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_mobile_gpu.html&quot;&gt;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_mobile_gpu.html&lt;/a&gt;&lt;br /&gt;
+[4] Tutorial on tuning for NVIDIA/AMD GPU: &lt;a href=&quot;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_cuda.html&quot;&gt;https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_cuda.html&lt;/a&gt;&lt;br /&gt;
+[5] Paper about AutoTVM: &lt;a href=&quot;https://arxiv.org/abs/1805.08166&quot;&gt;Learning to Optimize Tensor Programs&lt;/a&gt;&lt;br /&gt;
+[6] Paper about Intel CPU (by AWS contributors): &lt;a href=&quot;https://arxiv.org/abs/1809.02697&quot;&gt;Optimizing CNN Model Inference on CPUs&lt;/a&gt;&lt;/p&gt;
+
+</description>
+                <link>https://tvm.apache.org/2018/10/03/auto-opt-all</link>
+                <guid>https://tvm.apache.org/2018/10/03/auto-opt-all</guid>
+                <pubDate>Wed, 03 Oct 2018 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Building a Cross-Framework Deep Learning Compiler via DLPack</title>
+                <description>&lt;p&gt;Deep learning frameworks such as Tensorflow, PyTorch, and Apache MxNet provide a
+powerful toolbox for quickly prototyping and deploying deep learning models.
+Unfortunately, their ease-of-use has often come at the cost of fragmentation: it
+is only easy to use each framework in isolation. Vertical integration has made
+development streamlined for common use cases, but venturing off of the beaten
+path can be tricky.&lt;/p&gt;
+
+&lt;p&gt;One scenario that is poorly supported is passing tensors
+&lt;em&gt;directly&lt;/em&gt; from one framework to another in memory, without any data duplication
+or copies. Supporting such a use case would enable users to efficiently string together
+pipelines where certain operators are better supported in one framework (or
+faster) than in another. A shared data representation between
+frameworks would also bridge this gap, and allow compiler stacks to target a
+single format when generating code for operators.&lt;/p&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://github.com/dmlc/dlpack&quot;&gt;DLPack&lt;/a&gt; is an intermediate in-memory
+representation standard for tensor data structures. With DLPack as a common
+representation, we can leverage TVM in scripts written for frameworks that
+traditionally could only rely on vendor-provided libraries. TVM packed functions
+can operate on DLPack tensors, providing wrappers bridging tensor data
+structures from frameworks such as PyTorch and MxNet &lt;em&gt;with zero-data-copy&lt;/em&gt;.&lt;/p&gt;
+
+&lt;p&gt;DLPack presents a simple, portable in-memory data structure:&lt;/p&gt;
+&lt;div class=&quot;language-c highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;typedef&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;struct&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*!
+   * \brief The opaque data pointer points to the allocated data.
+   *  This will be CUDA device pointer or cl_mem handle in OpenCL.
+   *  This pointer is always aligned to 256 bytes as in CUDA.
+   */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The device context of the tensor */&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;DLContext&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief Number of dimensions */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ndim&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The data type of the pointer*/&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;DLDataType&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The shape of the tensor */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;int64_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*!
+   * \brief strides of the tensor,
+   *  can be NULL, indicating tensor is compact.
+   */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;int64_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;strides&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+  &lt;span class=&quot;cm&quot;&gt;/*! \brief The offset in bytes to the beginning pointer to data */&lt;/span&gt;
+  &lt;span class=&quot;kt&quot;&gt;uint64_t&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;byte_offset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DLTensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As an example, we declare and compile a matrix multiplication operator in TVM,
+and build a wrapper that uses the DLPack representation to allow this operator
+to support PyTorch tensors. We also repeat this demonstration with MxNet. This
+extension allows machine learning developers to quickly port research code to
+relatively unsupported hardware platforms without sacrificing performance.&lt;/p&gt;
+
+&lt;p&gt;Illustration of how DLPack provides an intermediate wrapper that is shared
+between frameworks and TVM:&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/pytorch-dlpack/dlpack.png&quot; alt=&quot;image&quot; width=&quot;65%&quot; /&gt;&lt;br /&gt;
+Figure 1&lt;/p&gt;
+
+&lt;p&gt;First, we compute a reference output in PyTorch:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We then define and build a TVM matrix multiplication operator, using the default
+schedule:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;convert&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;& [...]
+    &lt;span class=&quot;n&quot;&gt;X&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;Y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+
+    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class= [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&q [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;For brevity, we do not cover TVM’s large collection of scheduling primitives
+that we can use to optimize matrix multiplication. If you wish to make a custom
+GEMM operator run &lt;em&gt;fast&lt;/em&gt; on your hardware device, a detailed tutorial can be
+found &lt;a href=&quot;https://docs.tvm.ai/tutorials/optimize/opt_gemm.html&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;We then convert the TVM function into one that supports PyTorch tensors:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib.dlpack&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_pytorch_func&lt;/span&gt;
+    &lt;span class=&quot;c1&quot;&gt;# fmm is the previously built TVM function (Python function)
+&lt;/span&gt;    &lt;span class=&quot;c1&quot;&gt;# fmm_pytorch is the wrapped TVM function (Python function)
+&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;fmm_pytorch&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_pytorch_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmm_pytorch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;  [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;and verify that the results match.&lt;/p&gt;
+
+&lt;p&gt;We can repeat the same example, but using MxNet instead:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;mxnet&lt;/span&gt;
+    &lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib.mxnet&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_mxnet_func&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cpu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span clas [...]
+    &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;f_mxnet&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_mxnet_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;f_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;under-the-hood-of-the-pytorch-example&quot;&gt;Under the hood of the PyTorch Example&lt;/h2&gt;
+&lt;p&gt;TVM provides &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L455&quot;&gt;functions&lt;/a&gt; to convert dlpack tensors to tvm &lt;code class=&quot;highlighter-rouge&quot;&gt;NDArray&lt;/code&gt;s and
+vice-versa, so all that is needed is some syntactic sugar obtained by wrapping functions.
+&lt;code class=&quot;highlighter-rouge&quot;&gt;convert_func&lt;/code&gt; is a generic converter for frameworks using tensors with dlpack
+support, and can be used to implement convenient converters, such as
+&lt;code class=&quot;highlighter-rouge&quot;&gt;to_pytorch_func&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_type&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+    &lt;span class=&quot;k&quot;&gt;assert&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;callable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;_wrapper&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ndarray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_dlpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;to_dlpack_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+            &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_type&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;f [...]
+        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_wrapper&lt;/span&gt;
+
+&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;to_pytorch_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch&lt;/span&gt;
+    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch.utils.dlpack&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&l [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+</description>
+                <link>https://tvm.apache.org/2018/08/10/DLPack-Bridge</link>
+                <guid>https://tvm.apache.org/2018/08/10/DLPack-Bridge</guid>
+                <pubDate>Fri, 10 Aug 2018 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>VTA: An Open, Customizable Deep Learning Acceleration Stack </title>
+                <description>&lt;p style=&quot;text-align: center&quot;&gt;Thierry Moreau (VTA architect), Tianqi Chen (TVM stack), Ziheng Jiang† (graph compilation), Luis Vega (cloud deployment)&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Advisors: Luis Ceze, Carlos Guestrin, Arvind Krishnamurthy&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington&lt;/p&gt;
+
+&lt;p&gt;Hardware acceleration is an enabler for ubiquitous and efficient deep learning. With hardware accelerators appearing in the datacenter and edge devices, hardware specialization has taken on a prominent role in the deep learning system stack.&lt;/p&gt;
+
+&lt;p&gt;We are excited to announce the launch of the Versatile Tensor Accelerator (VTA, pronounced &lt;em&gt;vita&lt;/em&gt;), an open, generic, and customizable deep learning accelerator design. VTA is a programmable accelerator that exposes a RISC-like programming abstraction to describe tensor-level operations. We designed VTA to expose the most salient and common characteristics of mainstream deep learning accelerators, such as tensor operations, DMA load/stores, and explicit comput [...]
+
+&lt;p&gt;VTA is more than a standalone accelerator design: it’s an end-to-end solution that includes drivers, a JIT runtime, and an optimizing compiler stack based on TVM. The current release includes a behavioral hardware simulator, as well as the infrastructure to deploy VTA on low-cost FPGA hardware for fast prototyping. By extending the TVM stack with a customizable, and open source deep learning hardware accelerator design, we are exposing a transparent end-to-end deep learning stac [...]
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_stack.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The VTA and TVM stack together constitute a blueprint for an end-to-end, accelerator-centric deep learning system that can:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Provide an open deep learning system stack for hardware, compilers, and systems researchers alike to incorporate optimizations and co-design techniques.&lt;/li&gt;
+  &lt;li&gt;Lower the barrier of entry for machine learning practitioners to experiment with novel network architectures, operators and data representations that require specialized hardware support.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;use-case-scenarios-for-researchers&quot;&gt;Use-Case Scenarios for Researchers&lt;/h2&gt;
+
+&lt;p&gt;We highlight ways in which the VTA design together with a complete TVM software stack can enable novel opportunities across hardware, compilers, and deep learning research.&lt;/p&gt;
+
+&lt;h3 id=&quot;hardware-designers-and-computer-architects&quot;&gt;Hardware Designers and Computer Architects&lt;/h3&gt;
+
+&lt;p&gt;With new ASIC designs being regularly announced, providing a complete and usable software stack on top of novel hardware is essential to gain a competitive edge both in research circles, and commercially.
+Our VTA release provides a reference TVM software stack built for hardware accelerators.
+We hope to empower hardware designers to quickly build and deploy optimized deep learning libraries ready to be utilized by high-level frameworks of the likes of TensorFlow or PyTorch.
+Software support is essential for performing full-system evaluation to understand the limits and performance bottlenecks in hardware-accelerated systems.
+With the use of FPGAs as hardware deployment backends, we provide a complete solution for rapid and iterative hardware design prototyping.
+Finally, our vision is to see VTA grow into a collection of hardware designs, eventually leading to an open ecosystem of custom hardware accelerators.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;https://www.acm.org/binaries/content/gallery/acm/ctas/publications/artifact-badges.jpg/artifact-badges.jpg/acm%3Adesktopcta&quot; alt=&quot;image&quot; width=&quot;20%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;In addition, VTA is one of the first hardware-software reproducible &lt;a href=&quot;http://ctuning.org/ae/&quot;&gt;ACM artifacts&lt;/a&gt;, which can serve as a template for reproducible deep learning architecture research.
+The VTA artifact, deployable using &lt;a href=&quot;http://cknowledge.org/&quot;&gt;CK&lt;/a&gt;, was presented at ReQuEST 2018, co-located with &lt;a href=&quot;http://cknowledge.org/request-cfp-asplos2018.html&quot;&gt;ASPLOS&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h3 id=&quot;optimizing-compilers-researchers&quot;&gt;Optimizing Compilers Researchers&lt;/h3&gt;
+
+&lt;p&gt;Novel intermediate representations and optimizing compilers of the likes of TVM have been proposed to better take advantage of deep learning workloads characteristics.
+VTA complements TVM to provide accelerator-centric optimization passes, and low-level code generation. Our open-source deep learning compiler stack also aims to emulate the success of LLVM, by allowing the community to improve accelerator-centric compiler support over time, particularly as more hardware variants of VTA emerge.
+The extendability of the compiler stack, combined with the ability to modify the architecture and the programming interface of the hardware back-end should lead to exciting opportunities in hardware-software co-design for deep learning.&lt;/p&gt;
+
+&lt;h3 id=&quot;deep-learning-researchers&quot;&gt;Deep Learning Researchers&lt;/h3&gt;
+
+&lt;p&gt;A transparent and customizable software and hardware stack empowers deep learning researchers to come up with novel neural network operators, and data representations, all the while enabling the complete evaluation of those optimizations on end-to-end systems. Techniques like binarization are currently limited to CPU and GPU evaluations, unless significant engineering resources are dedicated to produce an FPGA or ASIC design that can evaluate the technique’s full energy savings  [...]
+
+&lt;h2 id=&quot;technical-details&quot;&gt;Technical Details&lt;/h2&gt;
+
+&lt;h3 id=&quot;stack-overview&quot;&gt;Stack Overview&lt;/h3&gt;
+
+&lt;p&gt;The VTA deep learning accelerator and TVM stack can bridge the gap between productivity-oriented deep learning frameworks, and performance-focused hardware substrates, such as FPGAs.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;NNVM, the graph-level optimizer, provides a graph-level Intermediate Representation (IR) used as a common language between different deep learning frameworks to take advantage of graph-level optimizations, such as operator fusion. The NNVM IR is also used to specify data layout and data format constraints: e.g. tiling for tensorization, and bit-packing for ultra-low precision computing.&lt;/li&gt;
+  &lt;li&gt;TVM, the tensor-level optimizer, builds upon the Halide DSL and schedule primitives to provide an optimizing compiler capable of bringing performance portability for deep learning across hardware back-ends. TVM brings novel scheduling primitives that target specialized hardware accelerators, such as tensorization, which lowers computation onto specialized tensor-tensor hardware instructions. In addition, it provides schedule primitives and lowering rules that allow for explic [...]
+  &lt;li&gt;The VTA runtime performs JIT compilation of VTA binaries (instruction streams and micro-kernel code), manages shared memory, and performs synchronization to hand off execution to VTA. The VTA runtime presents an API that looks generic to TVM, to hide complexities of platform-specific bookkeeping tasks. It exposes a C++ API that a TVM module can call into - this simplifies the future inclusion of other hardware accelerator designs, without having to drastically modify the uppe [...]
+  &lt;li&gt;VTA’s two-level ISA provides both (1) a high-level CISC ISA that describes variable-latency operations such as DMA loads or deep learning operators, and (2) a low-level, fixed-latency RISC ISA that describes low-level matrix-matrix operations. This two-level ISA allows both code compactness and expressiveness.&lt;/li&gt;
+  &lt;li&gt;Finally, VTA’s micro-architecture provides a flexible deep learning hardware design specification, that can be conveniently compiled onto other FPGA platforms, and eventually in the long term down to ASICs.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h3 id=&quot;vta-hardware-design-overview&quot;&gt;VTA Hardware Design Overview&lt;/h3&gt;
+
+&lt;p&gt;The Versatile Tensor Accelerator (VTA) is a generic deep learning accelerator built around a GEMM core, which performs dense matrix multiplication at a high computational throughput.
+The design is inspired by mainstream deep learning accelerators, of the likes of Google’s TPU accelerator. The design adopts decoupled access-execute to hide memory access latency and maximize utilization of compute resources. To a broader extent, VTA can serve as a template deep learning accelerator design, exposing a clean tensor computation abstraction to the compiler stack.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_overview.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The figure above presents a high-level overview of the VTA hardware organization. VTA is composed of four modules that communicate with each other via FIFO queues and single-writer/single-reader SRAM memory blocks, to allow for task-level pipeline parallelism.
+The compute module performs both dense linear algebra computation with its GEMM core, and general computation with its tensor ALU.
+It operates on a register file which, instead of storing scalar values, stores tensors of rank 1 or 2.
+The micro-op cache stores low-level code that dictates a sequence of operations to mutate the register file.&lt;/p&gt;
+
+&lt;p&gt;The VTA hardware design template offers modularity to the user, with the option to modify hardware datatypes, memory architecture, the GEMM core dimensions, hardware operators, and pipelining stages.
+Exposing multiple variants of VTA to the compiler stack facilitates the development of compilers, since we can test TVM’s ability to target a multitude of hardware accelerators, rather than a single design.&lt;/p&gt;
+
+&lt;h3 id=&quot;vta-prototyping-with-vta-simulator-and-pynq-fpga-board&quot;&gt;VTA Prototyping with VTA Simulator and Pynq FPGA Board&lt;/h3&gt;
+
+&lt;p&gt;The VTA release allows users to experiment with hardware acceleration, and accelerator-centric compiler optimizations in two ways.
+The first approach, which doesn’t require special hardware, is to run deep learning workloads on a behavioral simulator of the VTA design.
+This simulator back-end is readily available for developers to experiment with.
+The second approach relies on an off-the-shelf and low-cost FPGA development board – the &lt;a href=&quot;http://www.pynq.io/&quot;&gt;Pynq board&lt;/a&gt;, which exposes a reconfigurable FPGA fabric and an ARM SoC.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_system.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The VTA release offers a simple compilation and deployment flow of the VTA hardware design and TVM workloads on the Pynq platform, with the help of an RPC server interface.
+The RPC server handles FPGA reconfiguration tasks and TVM module invocation offloading onto the VTA runtime.
+The VTA runtime system runs on the ARM CPU of the Pynq embedded system, and generates VTA binaries on the fly to offload to the FPGA hardware.
+This complete solution allows for out-of-the-box prototyping on low-cost FPGAs, with an interactive and familiar Python environment, hiding much of the complexity and headaches of FPGA design away from the user.&lt;/p&gt;
+
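+&lt;p&gt;As a rough illustration of this flow, below is a minimal sketch of the generic TVM RPC mechanism that the Pynq deployment builds on. The host address, port, and file names are placeholders, and the VTA-specific bitstream programming and runtime reconfiguration steps are intentionally omitted.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import tvm
+from tvm import rpc
+from tvm.contrib import util
+
+# Connect to the RPC server running on the board (placeholder address/port).
+remote = rpc.connect('192.168.2.99', 9091)
+
+# 'lib' is assumed to be a module cross-compiled on the host with tvm.build;
+# export it, ship it over the RPC session, and load it on the board.
+temp = util.tempdir()
+path = temp.relpath('lib.tar')
+lib.export_library(path)
+remote.upload(path)
+rlib = remote.load_module('lib.tar')
+
+# Run on the board's ARM CPU; a_np and b_np are assumed NumPy inputs.
+ctx = remote.cpu(0)
+rlib(tvm.nd.array(a_np, ctx), tvm.nd.array(b_np, ctx))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+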
+&lt;p&gt;For programmers familiar with hardware and FPGAs, we expose the VTA design expressed in HLS C, and provide scripts built on top of the Xilinx toolchains to compile the design into an FPGA bitstream.
+We are currently building a repository of VTA variants, so that users can explore different design variants for their deep learning workloads without having to go through the time consuming FPGA compilation process.&lt;/p&gt;
+
+&lt;h2 id=&quot;performance-assessment&quot;&gt;Performance Assessment&lt;/h2&gt;
+
+&lt;p&gt;&lt;em&gt;VTA is at its early stages of development and we expect more performance improvements and optimizations to come.
+As of now we offer end-to-end performance evaluations on the low-cost Pynq board which incorporates a dated 28nm FPGA fabric.
+While this platform is meant for prototyping (the 2012 FPGA cannot compete with modern ASICs), we are porting VTA to newer high-performance FPGA platforms that will offer more competitive performance.&lt;/em&gt;&lt;/p&gt;
+
+&lt;p&gt;&lt;em&gt;We are working on more experiments and will release new results as they are obtained.&lt;/em&gt;&lt;/p&gt;
+
+&lt;h3 id=&quot;resource-utilization-on-resnet-18&quot;&gt;Resource Utilization on ResNet-18&lt;/h3&gt;
+
+&lt;p&gt;A popular method used to assess the efficient use of hardware is the roofline diagram: given a hardware design, how efficiently do different workloads utilize the hardware compute and memory resources? The roofline plot below shows the throughput achieved on different convolution layers of the ResNet-18 inference benchmark. Each layer has a different arithmetic intensity, i.e. compute-to-data-movement ratio.
+In the left half, convolution layers are bandwidth limited, whereas on the right half, they are compute limited.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_roofline.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+
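+&lt;p&gt;For readers less familiar with the metric, arithmetic intensity can be estimated with a simple back-of-the-envelope calculation. The sketch below uses a hypothetical convolution layer shape chosen purely for illustration; it is not one of the ResNet-18 layers measured above.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# Rough arithmetic intensity estimate for a convolution layer (illustrative numbers only).
+H, W, Cin, Cout, kh, kw = 14, 14, 256, 256, 3, 3
+bytes_per_elem = 1                              # e.g. int8 data on an accelerator
+
+macs = H * W * Cout * Cin * kh * kw             # multiply-accumulate operations
+moved = bytes_per_elem * (H * W * Cin           # input activations
+                          + Cin * Cout * kh * kw  # weights
+                          + H * W * Cout)         # output activations
+print(macs / float(moved))                      # MACs per byte moved
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+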
+&lt;p&gt;The goal behind designing a hardware architecture, and a compiler stack is to bring each workload as close as possible to the roofline of the target hardware.
+The roofline plot shows the effects of having the hardware and compiler work together to maximize utilization of the available hardware resources.
+The technique showcased is latency hiding, which requires explicit dependence tracking at the hardware level, compiler support to partition work, and explicit dependence insertion in the instruction stream during code generation.
+The result is an overall higher utilization of the available compute and memory resources.&lt;/p&gt;
+
+&lt;h3 id=&quot;end-to-end-resnet-18-evaluation&quot;&gt;End to end ResNet-18 evaluation&lt;/h3&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;http://raw.githubusercontent.com/uwsaml/web-data/master/vta/blogpost/vta_e2e.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;A benefit of having a complete compiler stack built for VTA is the ability to run end-to-end workloads. This is compelling in the context of hardware acceleration because we need to understand what performance bottlenecks and Amdahl’s law limitations stand in the way of obtaining faster performance.
+The bar plot above shows inference performance with and without offloading the ResNet convolutional layers to the FPGA-based VTA design, on the Pynq board’s ARM Cortex A9 SoC.
+At a glance, it is clear that VTA is accomplishing its goal of reducing the time it takes to perform convolutions on the CPU (dark blue).
+However, it becomes apparent that other operators need offloading, as they now constitute a new bottleneck.
+This kind of high-level visibility is essential to system designers who want to understand how systems affect end-to-end performance.&lt;/p&gt;
+
+&lt;h2 id=&quot;open-source-effort&quot;&gt;Open Source Effort&lt;/h2&gt;
+&lt;p&gt;VTA is a research effort at the Paul G. Allen School of Computer Science and Engineering at the University of Washington, and is now integrated into the TVM stack. The TVM project follows the Apache open-source model to create a community-maintained project. You are more than welcome to join us and lead the effort.&lt;/p&gt;
+
+&lt;h2 id=&quot;acknowledgements&quot;&gt;Acknowledgements&lt;/h2&gt;
+&lt;p&gt;VTA is a research project that came out of the SAML group, which is generously supported by grants from DARPA and the National Science Foundation and gifts from Huawei, Oracle, Intel and anonymous donors.&lt;/p&gt;
+
+&lt;h2 id=&quot;get-started&quot;&gt;Get Started!&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;TVM and VTA Github page can be found here: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;.&lt;/li&gt;
+  &lt;li&gt;You can get started with easy to follow &lt;a href=&quot;https://docs.tvm.ai/vta/tutorials/index.html&quot;&gt;tutorials on programming VTA with TVM&lt;/a&gt;.&lt;/li&gt;
+  &lt;li&gt;For more technical details on VTA, read our &lt;a href=&quot;https://arxiv.org/abs/1807.04188&quot;&gt;VTA technical report&lt;/a&gt; on ArXiv.&lt;/li&gt;
+&lt;/ul&gt;
+</description>
+                <link>https://tvm.apache.org/2018/07/12/vta-release-announcement</link>
+                <guid>https://tvm.apache.org/2018/07/12/vta-release-announcement</guid>
+                <pubDate>Thu, 12 Jul 2018 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Bringing TVM into TensorFlow for Optimizing Neural Machine Translation on GPU</title>
+                <description>&lt;h2 id=&quot;author&quot;&gt;Author&lt;/h2&gt;
+
+&lt;p&gt;This is a guest blog post contributed by Alibaba Group’s Machine Translation Platform team and PAI-Blade team.&lt;/p&gt;
+
+&lt;h2 id=&quot;background&quot;&gt;Background&lt;/h2&gt;
+
+&lt;p&gt;Neural Machine Translation (NMT) is an end-to-end approach for automating translation, with the potential to overcome the weaknesses of conventional phrase-based translation systems. Recently, Alibaba Group has been working on deploying an NMT service for global e-commerce.&lt;/p&gt;
+
+&lt;p&gt;Currently we are using Transformer [1] as the major backbone in our NMT system since it is more amenable to efficient offline training, with on-par (or even higher) precision compared to classical RNN/LSTM-based models. Although Transformer is friendly to the offline training phase as it breaks the dependencies across time steps, it is not very efficient for online inference. In our production environment, it has been found that the inference speed of the initial version of Trans [...]
+One particular challenge we observed is that batch matmul is a major performance hot-spot in Transformer, and the current implementation in cuBLAS is not well optimized.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nmt-transformer/model_arch.png&quot; alt=&quot;image&quot; width=&quot;40%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The results below show that the TVM-generated kernel (with schedule optimization) brings at least a &lt;b&gt;&lt;em&gt;13X&lt;/em&gt;&lt;/b&gt; speed-up for batch matmul computation, and a further speed-up with operator fusion enabled.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nmt-transformer/batch-matmul-bar-charts.png&quot; alt=&quot;image&quot; width=&quot;45%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;batch-matmul&quot;&gt;Batch Matmul&lt;/h2&gt;
+
+&lt;h3 id=&quot;why-batch-matmul&quot;&gt;Why batch matmul&lt;/h3&gt;
+&lt;p&gt;In Transformer, batch matmul is widely used in the computation of multi-head attention. Using batch matmul, multiple heads in the attention layer can run in parallel, which can help improve the computation efficiency of the hardware.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nmt-transformer/batchmatmul.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We conducted a thorough profiling of the Transformer model in the inference phase, and it shows that batch matmul computation contributes up to ~30% of GPU kernel execution time. Using nvprof [2] to do some first-principles analysis of cuBLAS’s batch matmul kernel, it is clearly indicated that the current implementation is quite under-performing, and several interesting phenomena are observed.&lt;/p&gt;
+
+&lt;h3 id=&quot;what-is-batch-matmul&quot;&gt;What is batch matmul&lt;/h3&gt;
+&lt;p&gt;Typically, a batch matmul computation performs the matrix-matrix multiplication over a batch of matrices. The batch is considered to be “uniform”, i.e. all instances have the same dimensions (M, N, K), leading dimensions (lda, ldb, ldc) and transpositions for their respective A, B and C matrices.&lt;/p&gt;
+
+&lt;p&gt;Batch matmul computation can be described more concretely as follows:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;void BatchedGemm(input A, input B, output C, M, N, K, batch_dimension) {
+  for (int i = 0; i &amp;lt; batch_dimension; ++i)  {
+    DoGemm(A[i],B[i],C[i],M,K,N)
+  }
+}
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h4 id=&quot;batch-matmul-shapes&quot;&gt;Batch matmul shapes&lt;/h4&gt;
+
+&lt;p&gt;In language translation tasks, the shape of the batch matmul is significantly smaller than that of normal matmul computations in other workloads. The shape in Transformer is related to the length of the input sentences and to the number of decoder steps. Normally, it is smaller than 30.&lt;/p&gt;
+
+&lt;p&gt;As for the batch dimension, it is a fixed number given a certain inference batch size. For instance, if 16 is used as the batch size with a beam size of 4, the batch dimension is 16 * 4 * #head (the number of heads in multi-head attention, which is usually 8), as worked out below. The matrix dimensions M, K, N are within the range of [1, max decode length] or [1, max encode length].&lt;/p&gt;
+
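+&lt;p&gt;As a concrete, NumPy-only sketch of the resulting shapes (using the example numbers above), an inference batch size of 16 with beam size 4 and 8 heads yields the batch dimension of 512 that appears in the profiling table below:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+batch_dim = 16 * 4 * 8             # inference batch * beam size * #heads = 512
+M, N, K = 1, 17, 128               # e.g. one decode step attending over a 17-token source
+
+A = np.random.rand(batch_dim, M, K).astype('float32')
+B = np.random.rand(batch_dim, K, N).astype('float32')
+C = np.matmul(A, B)                # batch matmul of shape [512, 1, 17, 128]
+print(C.shape)                     # (512, 1, 17)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+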
+&lt;h3 id=&quot;performance-issue-of-cublas-batch-matmul&quot;&gt;Performance issue of cuBLAS’ batch matmul&lt;/h3&gt;
+
+&lt;p&gt;Firstly, we make a theoretical FLOPs analysis over the batch matmul kernels. The results are quite interesting: all of the batch matmuls have limited computation intensity (less than 1 TFLOP).&lt;/p&gt;
+
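+&lt;p&gt;As a sanity check on the numbers reported below, the theoretical operation count of a batch matmul can be computed directly. Counting each multiply-accumulate once (which is how the table below appears to count FLOPs), the profiled shapes work out as follows:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;def theoretical_flops(batch, M, N, K):
+    # one multiply-accumulate per (m, n, k) triple, per batch instance
+    return batch * M * N * K
+
+print(theoretical_flops(512, 17, 17, 128))   # 18939904
+print(theoretical_flops(512, 1, 17, 128))    # 1114112
+print(theoretical_flops(512, 30, 30, 128))   # 58982400
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+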
+&lt;p&gt;Then we profile the cuBLAS performance of batch matmul with multiple shapes through nvprof. The table below shows some of the metrics obtained on an NVIDIA M40 GPU with CUDA 8.0.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;input shape &lt;br /&gt; [batch, M, N, K]&lt;/th&gt;
+      &lt;th&gt;kernel&lt;/th&gt;
+      &lt;th&gt;theoretical FLOPs&lt;/th&gt;
+      &lt;th&gt;nvprof observed FLOPs&lt;/th&gt;
+      &lt;th&gt;theoretical FLOPs / &lt;br /&gt; observed FLOPs&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 17, 17, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;18939904&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;0.87%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 1, 17, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;1114112&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;0.052%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 17, 1, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;1114112&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;0.052%&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[512, 30, 30, 128]&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt;&lt;/td&gt;
+      &lt;td&gt;58982400&lt;/td&gt;
+      &lt;td&gt;2155872256&lt;/td&gt;
+      &lt;td&gt;2.74%&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Even with different shapes (varying in M, N, K), all the &lt;strong&gt;maxwell_sgemmBatched_128x128_raggedMn_tn&lt;/strong&gt; calls execute the same amount of FLOPs, which is much bigger than the theoretical value. It can be inferred that all these different shapes may be padded to a certain shape. Among all these shapes, even in the best case, the theoretical FLOPs is still only 2.74% of the actually executed FLOPs, &lt;em&gt;therefore most of the computation is quite redundant [...]
+
+&lt;p&gt;&lt;b&gt;It is obvious that cuBLAS’ batch matmul implementation is far from efficient. Thus we use TVM to generate efficient batch matmul kernels for our NMT workloads.&lt;/b&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;batch-matmul-computation&quot;&gt;Batch matmul computation&lt;/h2&gt;
+
+&lt;p&gt;In TVM, a general batch matmul computation can be declared as:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+A = tvm.placeholder((batch, M, K), name='A')
+B = tvm.placeholder((batch, K, N), name='B')
+k = tvm.reduce_axis((0, K), 'k')
+C = tvm.compute((batch, M, N),
+         lambda b, y, x: tvm.sum(A[b, y, k] * B[b, k, x], axis = k),
+         name = 'C')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;schedule-optimization&quot;&gt;Schedule optimization&lt;/h2&gt;
+
+&lt;p&gt;After declaring the computation, we need to devise our own schedule carefully to squeeze out its performance potential.&lt;/p&gt;
+
+&lt;h3 id=&quot;tuning-parameters-of-blockthread-numbers&quot;&gt;Tuning parameters of block/thread numbers&lt;/h3&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;  # thread indices
+  block_y = tvm.thread_axis(&quot;blockIdx.y&quot;)
+  block_x = tvm.thread_axis(&quot;blockIdx.x&quot;)
+  thread_y = tvm.thread_axis((0, num_thread_y), &quot;threadIdx.y&quot;)
+  thread_x = tvm.thread_axis((0, num_thread_x), &quot;threadIdx.x&quot;)
+  thread_yz = tvm.thread_axis((0, vthread_y), &quot;vthread&quot;, name=&quot;vy&quot;)
+  thread_xz = tvm.thread_axis((0, vthread_x), &quot;vthread&quot;, name=&quot;vx&quot;)
+
+  # block partitioning
+  BB, FF, MM, PP = s[C].op.axis
+  BBFF = s[C].fuse(BB, FF)
+  MMPP = s[C].fuse(MM, PP)
+  by, ty_block = s[C].split(BBFF, factor = num_thread_y * vthread_y)
+  bx, tx_block = s[C].split(MMPP, factor = num_thread_x * vthread_x)
+  s[C].bind(by, block_y)
+  s[C].bind(bx, block_x)
+  vty, ty = s[C].split(ty_block, nparts = vthread_y)
+  vtx, tx = s[C].split(tx_block, nparts = vthread_x)
+  s[C].reorder(by, bx, vty, vtx, ty, tx)
+  s[C].reorder(by, bx, ty, tx)
+  s[C].bind(ty, thread_y)
+  s[C].bind(tx, thread_x)
+  s[C].bind(vty, thread_yz)
+  s[C].bind(vtx, thread_xz)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;We fuse the outer dimensions of the batch matmul, i.e. the BB and FF axes of the op, normally known as the “batch” dimension in batch matmul computation. Then we split the outer and the inner dimensions by a factor of (&lt;code class=&quot;highlighter-rouge&quot;&gt;number_thread * vthread&lt;/code&gt;).&lt;/p&gt;
+
+&lt;p&gt;A strided pattern is not needed in batch matmul, thus the virtual thread numbers (&lt;code class=&quot;highlighter-rouge&quot;&gt;vthread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;vthread_x&lt;/code&gt;) are both set to 1.&lt;/p&gt;
+
+&lt;h4 id=&quot;finding-the-best-combination-of-number_thread&quot;&gt;Finding the best combination of number_thread&lt;/h4&gt;
+
+&lt;p&gt;The results below are obtained on an NVIDIA M40 GPU device with CUDA 8.0.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Input Shape [batch,features,M,N,K]&lt;/th&gt;
+      &lt;th&gt;num_thread_y, num_thread_x&lt;/th&gt;
+      &lt;th&gt;num_vthread_y, num_vthread_x&lt;/th&gt;
+      &lt;th&gt;Time(us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;8,1&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;37.62&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;4,1&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;39.30&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;1,1&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;38.82&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;1,1&lt;/td&gt;
+      &lt;td&gt;256,1&lt;/td&gt;
+      &lt;td&gt;41.95&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;[64,8,1,17,128]&lt;/td&gt;
+      &lt;td&gt;32,1&lt;/td&gt;
+      &lt;td&gt;1,1&lt;/td&gt;
+      &lt;td&gt;94.61&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;As learned from &lt;a href=&quot;http://tvmlang.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html&quot;&gt;past experience&lt;/a&gt;, the method to find the best combination of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; is through brute-force search. After a brute-force search, the best combination for current shape can be f [...]
+
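+&lt;p&gt;A minimal sketch of such a brute-force search is given below. It assumes a helper &lt;code class=&quot;highlighter-rouge&quot;&gt;make_schedule(num_thread_y, num_thread_x)&lt;/code&gt; that rebuilds the schedule listed above for the given thread counts, with the placeholders &lt;code class=&quot;highlighter-rouge&quot;&gt;A&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;B&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;C&lt;/code&gt; and matching NumPy buffers already defined; each candidate is simply timed with TVM’s time evaluator.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;def search_thread_numbers(candidates):
+    # candidates: list of (num_thread_y, num_thread_x) pairs to try
+    ctx = tvm.gpu(0)
+    a, b, c = [tvm.nd.array(x, ctx) for x in (a_np, b_np, c_np)]
+    results = []
+    for num_thread_y, num_thread_x in candidates:
+        s = make_schedule(num_thread_y, num_thread_x)  # assumed helper, builds the schedule above
+        func = tvm.build(s, [A, B, C], 'cuda')
+        timer = func.time_evaluator(func.entry_name, ctx, number=100)
+        results.append((timer(a, b, c).mean, num_thread_y, num_thread_x))
+    return min(results)  # best (time, num_thread_y, num_thread_x)
+
+print(search_thread_numbers([(8, 1), (4, 1), (1, 1), (32, 1)]))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+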
+&lt;h2 id=&quot;fuse-batch-matmul-with-other-operations&quot;&gt;Fuse batch matmul with other operations&lt;/h2&gt;
+
+&lt;p&gt;Normally, the existing “black-box” cuBLAS library calls act as the boundary of the commonly used “op fusion” optimization tactics. However, with the generated efficient batch matmul kernel, the fusion boundary can be easily broken: more than just element-wise operations can be fused, thus further performance improvement can be obtained.&lt;/p&gt;
+
+&lt;p&gt;It is observed from the computation graph that a batch matmul is always followed by a &lt;em&gt;broadcast add&lt;/em&gt; operation or a &lt;em&gt;transpose&lt;/em&gt; operation. By fusing the “add” or “transpose” operation with batch matmul, kernel launch overhead and redundant memory access time can be reduced.&lt;/p&gt;
+
+&lt;p&gt;Batch matmul and broadcast add fusion computation can be declared as follows:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+A = tvm.placeholder((batch_size, features, M, K), name='A')
+# the shape of B is (N, K) rather than (K, N) because B is transposed in this fusion pattern
+B = tvm.placeholder((batch_size, features, N, K), name='B')
+ENTER = tvm.placeholder((batch_size, 1, M, N), name = 'ENTER')
+k = tvm.reduce_axis((0, K), 'k')
+C = tvm.compute(
+           (batch_size, features, M, N),
+           lambda yb, yf, m, x: tvm.sum(A[yb, yf, m, k] * B[yb, yf, x, k], axis = k),
+           name = 'C')
+D = topi.broadcast_add(C, ENTER)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Batch matmul and transpose fusion computation can be declared as follows; note that the transpose is realized simply by writing the output directly in the (batch, M, features, N) layout:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+A = tvm.placeholder((batch_size, features, M, K), name='A')
+B = tvm.placeholder((batch_size, features, K, N), name='B')
+k = tvm.reduce_axis((0, K), 'k')
+C = tvm.compute(
+           (batch_size, M, features, N),
+           lambda yb, m, yf, x: tvm.sum(A[yb, yf, m, k] * B[yb, yf, k, x], axis = k),
+           name = 'C')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;h3 id=&quot;fusion-kernel-performance&quot;&gt;Fusion Kernel Performance&lt;/h3&gt;
+
+&lt;p&gt;The shape of [batch=64, heads=8, M=1, N=17, K=128] is chosen to illustrate the performance of the generated code. 17 is chosen as the sequence length since it is the average input length in our production scenarios.&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;tf-r1.4 &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 513.9 us&lt;/li&gt;
+  &lt;li&gt;tf-r1.4 &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (separate): 541.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 37.62 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (fused): 38.39 us&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;The kernel fusion optimization brings a further &lt;b&gt;&lt;em&gt;1.7X&lt;/em&gt;&lt;/b&gt; speed-up.&lt;/p&gt;
+
+&lt;h2 id=&quot;integration-with-tensorflow&quot;&gt;Integration with TensorFlow&lt;/h2&gt;
+
+&lt;p&gt;The input shapes of batch matmul in our workload are finite and can easily be enumerated in advance. With those pre-defined shapes, we can generate highly optimized CUDA kernels ahead of time (fixed-shape computation brings the best optimization potential). Meanwhile, a general batch matmul kernel suitable for most of the shapes is also generated to provide a fall-back mechanism for the shapes that do not have a corresponding ahead-of-time generated kernel.&lt;/p&gt;
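+
+&lt;p&gt;The dispatch mechanism can be pictured with the sketch below; &lt;code class=&quot;highlighter-rouge&quot;&gt;compile_batch_matmul&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;compile_generic_batch_matmul&lt;/code&gt; are hypothetical stand-ins for the TVM build flow above, not our actual integration code:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# shapes are (batch, features, M, N, K)
+def compile_batch_matmul(shape):
+    # hypothetical stand-in: would build a shape-specialized TVM kernel as shown earlier
+    return lambda a, b, c: None
+
+def compile_generic_batch_matmul():
+    # hypothetical stand-in: would build the general (slower) fall-back kernel
+    return lambda a, b, c: None
+
+PREDEFINED_SHAPES = [
+    (64, 8, 1, 17, 128),
+    # other shapes enumerated from the production workload
+]
+
+kernel_table = {shape: compile_batch_matmul(shape) for shape in PREDEFINED_SHAPES}
+generic_kernel = compile_generic_batch_matmul()
+
+def batch_matmul(a, b, c, shape):
+    # use the ahead-of-time specialized kernel when available, otherwise fall back
+    kernel_table.get(shape, generic_kernel)(a, b, c)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;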
+
+&lt;p&gt;The generated efficient kernels for specific shapes and the fall-back one are integrated into the TensorFlow framework. We develop fused ops, such as BatchMatMulTranspose or BatchMatMulAdd, to launch the specific generated kernel using TVM’s runtime API for a certain input shape, or to invoke the fall-back kernel. A graph optimization pass is conducted to automatically replace the original batch &lt;em&gt;matmul + add/transpose&lt;/em&gt; pattern with the fused ops. Meanwhile, by combin [...]
+
+&lt;h2 id=&quot;summary&quot;&gt;Summary&lt;/h2&gt;
+&lt;p&gt;Inside Alibaba, we found that TVM is a very productive tool to develop high performance GPU kernels to meet our in-house requirements. In this blog, the NMT Transformer model is taken as an example to illustrate our optimization strategy with TVM. Firstly, we locate the hot-spot of the Transformer model through first-principle analysis. Then we use TVM to generate highly optimized CUDA kernels to replace the cuBLAS version (&lt;b&gt;&lt;em&gt;13X&lt;/em&gt;&lt;/b&gt; speed-up is observed). N [...]
+
+&lt;h2 id=&quot;resources&quot;&gt;Resources&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/Orion34C/tvm-batch-matmul-example/blob/master/tvm_batch_matmul_transpose_m1_kX.py&quot;&gt;TVM implementation of fused batch matmul + transpose computation&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
+&lt;p&gt;[1] &lt;a href=&quot;https://arxiv.org/pdf/1706.03762.pdf&quot;&gt;Attention is All You Need&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[2] &lt;a href=&quot;https://devblogs.nvidia.com/cuda-pro-tip-nvprof-your-handy-universal-gpu-profiler/&quot;&gt;nvprof is Your Handy Universal GPU Profiler&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;[3] &lt;a href=&quot;https://github.com/tensorflow/tensorflow/pull/16306&quot;&gt;Add Loop Invariant Node Motion Optimization in GraphOptimizer&lt;/a&gt;&lt;/p&gt;
+</description>
+                <link>https://tvm.apache.org/2018/03/23/nmt-transformer-optimize</link>
+                <guid>https://tvm.apache.org/2018/03/23/nmt-transformer-optimize</guid>
+                <pubDate>Fri, 23 Mar 2018 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Compiling Deep Learning Models to WebGL with TVM</title>
+                <description>&lt;p&gt;Now TVM comes with a brand-new OpenGL/WebGL backend!
+This blog post explains what it is, and what you can achieve with it.&lt;/p&gt;
+
+&lt;h1 id=&quot;the-openglwebgl-backend&quot;&gt;The OpenGL/WebGL Backend&lt;/h1&gt;
+
+&lt;p&gt;TVM already targets a lot of backends covering a variety of platforms: CPU, GPU,
+mobile devices, etc… This time we are adding another backend: OpenGL/WebGL.&lt;/p&gt;
+
+&lt;p&gt;OpenGL/WebGL enables us to leverage the GPU in an environment which does not
+have CUDA installed. It is, for the time being, the only way of using the GPU
+inside a browser.&lt;/p&gt;
+
+&lt;p&gt;This new backend allows us to use OpenGL/WebGL in 3 different ways:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;Local OpenGL&lt;/strong&gt;:
+We can compile a deep learning model into OpenGL and directly
+run it on the local machine, entirely using Python.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;WebGL with RPC&lt;/strong&gt;:
+We can compile a deep learning model into WebGL and export
+it as a shared library via Emscripten, with JavaScript host code and WebGL device code. Then
+we can deploy that library through RPC onto a TVM JavaScript runtime system
+running inside a browser.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;WebGL with static library&lt;/strong&gt;:
+We can compile a deep learning model into WebGL,
+link it with the TVM JavaScript runtime system and export the entire package.
+Then we can run the model in a web page on a browser, with no dependency.
+The detailed flow is described in figure 1.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;We rely on Emscripten and its fastcomp LLVM backend to generate the JavaScript code.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opengl/webgl-flow.png&quot; alt=&quot;image&quot; width=&quot;65%&quot; /&gt;&lt;br /&gt;
+Figure 1&lt;/p&gt;
+
+&lt;p&gt;See &lt;a href=&quot;https://github.com/dmlc/nnvm/blob/master/tutorials/from_mxnet_to_webgl.py&quot;&gt;here&lt;/a&gt;
+for examples of all three of them.&lt;/p&gt;
+
+&lt;h1 id=&quot;how-is-this-different-from-x&quot;&gt;How is this Different from X?&lt;/h1&gt;
+
+&lt;p&gt;Running a neural network on a browser isn’t an entirely new thing.
+Andrej Karpathy’s &lt;a href=&quot;https://cs.stanford.edu/people/karpathy/convnetjs/&quot;&gt;ConvNetJS&lt;/a&gt;
+and Google’s &lt;a href=&quot;https://deeplearnjs.org/&quot;&gt;DeepLearning.JS&lt;/a&gt; are examples of that.&lt;/p&gt;
+
+&lt;p&gt;So what’s unique about TVM with WebGL? The big difference is that the op kernels
+in TVM are automatically compiled, not handwritten. As shown in Figure 2, TVM
+utilizes a unified AST to define kernels, and compiles them to code for different
+platforms.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opengl/comparison.png&quot; alt=&quot;&quot; width=&quot;50%&quot; /&gt;&lt;br /&gt;
+Figure 2&lt;/p&gt;
+
+&lt;p&gt;This means that:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;To deploy your existing model to WebGL, you don’t need to write a lot of
+additional code. The NNVM/TVM model definition is the same for all targets, so
+you just need to compile it to a new target.&lt;/li&gt;
+  &lt;li&gt;To add a new op kernel, you only need to define it in TVM once, instead of
+implementing it once for every target. You don’t need to know how to write
+GLSL code to add a new op kernel to WebGL! (See the small sketch below.)&lt;/li&gt;
+&lt;/ul&gt;
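+
+&lt;p&gt;As a small illustration of the second point (a sketch of ours, not code from the tutorial, and assuming a TVM build with the OpenCL runtime enabled), the compute definition below is written once and then scheduled and built for two different targets:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import tvm
+
+n = 1024
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+# the op is defined once, independent of any target
+C = tvm.compute((n,), lambda i: A[i] + B[i], name='C')
+
+# CPU: the default schedule is enough
+s_cpu = tvm.create_schedule(C.op)
+f_cpu = tvm.build(s_cpu, [A, B, C], target='llvm')
+
+# OpenCL: bind the loop to GPU threads before building for the device target
+s_cl = tvm.create_schedule(C.op)
+bx, tx = s_cl[C].split(C.op.axis[0], factor=64)
+s_cl[C].bind(bx, tvm.thread_axis('blockIdx.x'))
+s_cl[C].bind(tx, tvm.thread_axis('threadIdx.x'))
+f_cl = tvm.build(s_cl, [A, B, C], target='opencl')
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;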
+
+&lt;h1 id=&quot;benchmark&quot;&gt;Benchmark&lt;/h1&gt;
+
+&lt;p&gt;Here we perform a benchmark for a typical workload: image classification using
+resnet18.&lt;/p&gt;
+
+&lt;p&gt;I’m using my &lt;a href=&quot;https://www.asus.com/us/Laptops/N76VZ/&quot;&gt;5-year-old laptop&lt;/a&gt; which
+has an 8-core Intel® Core™ i7-3610QM, and a GTX650M.&lt;/p&gt;
+
+&lt;p&gt;In this benchmark, we download a resnet18 model from the Gluon model zoo, and
+perform end-to-end classification on a cat image. We only measure the model
+execution time (without model/input/parameter loading), and each model is run
+100 times to get an average. The results are shown in figure 3.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opengl/opengl-benchmark.png&quot; alt=&quot;image&quot; /&gt;&lt;br /&gt;
+Figure 3&lt;/p&gt;
+
+&lt;p&gt;The benchmark is run in 4 different settings:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;strong&gt;CPU (LLVM)&lt;/strong&gt;: The model is compiled into LLVM IR and JIT’ed. Therefore, it is
+run entirely on the CPU.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;OpenCL&lt;/strong&gt;: The model is compiled into OpenCL. There is still some glue code
+compiled to LLVM, which is responsible for setting up and launching OpenCL
+kernels. Then we run it on the local machine.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;OpenGL&lt;/strong&gt;: Same as OpenCL, but compiled to OpenGL.&lt;/li&gt;
+  &lt;li&gt;&lt;strong&gt;WebGL&lt;/strong&gt;: The glue code is compiled to LLVM, and transformed to JavaScript using
+Emscripten’s Fastcomp LLVM backend.
+The device code is compiled to WebGL. We execute the model in Firefox.&lt;/li&gt;
+&lt;/ul&gt;
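+
+&lt;p&gt;For the GPU settings, the compilation flow roughly follows the sketch below (based on the linked tutorial and the NNVM/MXNet Python APIs of that time, shown for the OpenGL target; the exact benchmark scripts may differ):&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+import tvm
+import nnvm.compiler
+import nnvm.frontend
+from tvm.contrib import graph_runtime
+from mxnet.gluon.model_zoo.vision import get_model
+
+# load resnet18 from the Gluon model zoo and convert it to an NNVM graph
+block = get_model('resnet18_v1', pretrained=True)
+sym, params = nnvm.frontend.from_mxnet(block)
+
+# compile for the local OpenGL target; 'llvm' and 'opencl' work the same way
+shape = {'data': (1, 3, 224, 224)}
+graph, lib, params = nnvm.compiler.build(sym, target='opengl', shape=shape, params=params)
+
+# run one inference on a random input
+ctx = tvm.opengl(0)
+module = graph_runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input('data', tvm.nd.array(np.random.rand(1, 3, 224, 224).astype('float32')))
+module.run()
+out = module.get_output(0, tvm.nd.empty((1, 1000)))
+print(out.asnumpy().argmax())
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;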
+
+&lt;p&gt;From the results above we can observe that the TVM OpenGL backend has performance
+similar to OpenCL. More interestingly, the WebGL version inside the browser
+isn’t significantly slower than desktop OpenGL. Considering that the host code
+is JavaScript, this is quite surprising. This might be due to the fact that
+Emscripten generates &lt;a href=&quot;http://asmjs.org/&quot;&gt;asm.js&lt;/a&gt; which enables dramatic
+optimizations in Firefox.&lt;/p&gt;
+
+&lt;p&gt;This is a first step toward automatic compilation of deep learning models
+into the web browser. We expect more performance improvements as we bring
+optimizations into the TVM stack.&lt;/p&gt;
+
+&lt;h2 id=&quot;show-me-the-code&quot;&gt;Show me the Code&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;Checkout &lt;a href=&quot;https://github.com/dmlc/nnvm/blob/master/tutorials/from_mxnet_to_webgl.py&quot;&gt;this complete code example&lt;/a&gt;.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;acknowledgement&quot;&gt;Acknowledgement&lt;/h2&gt;
+&lt;p&gt;We thank the developers of Emscripten for providing the fastcomp toolchain and for their help during the development.&lt;/p&gt;
+</description>
+                <link>https://tvm.apache.org/2018/03/12/webgl</link>
+                <guid>https://tvm.apache.org/2018/03/12/webgl</guid>
+                <pubDate>Mon, 12 Mar 2018 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Optimizing Mobile Deep Learning on ARM GPU with TVM</title>
+                <description>&lt;p&gt;With the great success of deep learning, the demand for
+deploying deep neural networks to mobile devices is growing rapidly.
+Similar to what we do on desktop platforms, utilizing the GPU in mobile devices
+can benefit both inference speed and energy efficiency. However, most
+existing deep learning frameworks do not support mobile GPUs very well.
+The difficulty lies in the difference between mobile GPU architecture and
+desktop GPU architecture, which means special effort is required for optimizing on
+mobile GPUs. This non-trivial extra work eventually results in the poor support
+of mobile GPUs in most deep learning frameworks.&lt;/p&gt;
+
+&lt;p&gt;TVM addresses the difficulty of deploying to different hardware by
+introducing a unified IR stack, with which optimization for different
+hardware can be done easily.  In this post, we show how we use
+&lt;a href=&quot;http://tvmlang.org/2017/08/17/tvm-release-announcement.html&quot;&gt;TVM&lt;/a&gt;/&lt;a href=&quot;http://tvmlang.org/2017/10/06/nnvm-compiler-announcement.html&quot;&gt;NNVM&lt;/a&gt; to
+generate efficient kernels for ARM Mali GPU and do end-to-end compilation.
+In our test on Mali-T860 MP4, compared with
+&lt;a href=&quot;https://developer.arm.com/technologies/compute-library&quot;&gt;Arm Compute Library&lt;/a&gt;,
+our method is 1.4x faster on VGG-16 and 2.2x faster on MobileNet.
+Both graph-level and operator-level optimization contribute
+to this speed up.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opt-mali/end2end.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;center&gt; Figure. Inference Speed of Different Backends on ImageNet&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;h1 id=&quot;mali-midgrad-gpu&quot;&gt;Mali Midgard GPU&lt;/h1&gt;
+&lt;p&gt;We will use Firefly-RK3399 with Mali-T860 MP4 as our test environment,
+so we mainly focus on Mali T8xx below.&lt;/p&gt;
+
+&lt;h2 id=&quot;architecture&quot;&gt;Architecture&lt;/h2&gt;
+&lt;p&gt;Figure 1 is an overview of the Mali Architecture on T860 and T880.
+The GPUs are scalable up to 16 coherent shader cores. Inside each
+shader core, there are 2 or 3 arithmetic pipelines, 1 load/store pipeline
+and 1 texture pipeline (so-called TriPipe). The ALU in each arithmetic
+pipeline has four 128-bit vector units and one scalar unit.&lt;/p&gt;
+
+&lt;p&gt;We use OpenCL for GPU computing. When mapping to the OpenCL model, each
+shader core executes one or several work groups. Each shader core supports
+up to 384 concurrently executing threads. Each work item in OpenCL
+typically maps to a single thread on a Mali GPU.
+The Mali GPUs use a VLIW (Very Long Instruction Word) architecture.
+Each instruction word contains multiple operations. The Mali GPUs
+also use SIMD, so that most arithmetic instructions operate on
+multiple data elements simultaneously. &lt;sup&gt;[1]&lt;/sup&gt;&lt;/p&gt;
+
+&lt;center&gt; &lt;img width=&quot;50%&quot; src=&quot;/images/opt-mali/mali-arch.png&quot; /&gt; &lt;/center&gt;
+&lt;center&gt; Figure 1. Mali T860 and T880 (source &lt;sup&gt;[2]&lt;/sup&gt;) &lt;/center&gt;
+
+&lt;h2 id=&quot;difference-with-nvidias-gpus&quot;&gt;Difference with NVIDIA’s GPUs&lt;/h2&gt;
+&lt;p&gt;Here are some differences that we should be aware of when writing OpenCL
+code for Mali GPUs, compared with writing for NVIDIA’s GPUs.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;Mali GPUs use a unified global memory. In NVIDIA’s GPUs, we usually
+copy data to shared memory, because NVIDIA’s GPUs have physically
+separate global memory, shared memory and register. In Mali, this copy
+does not improve performance and can be removed. Besides, Mali GPUs
+usually share the global memory with CPU, so there is no need for copying
+between CPU and GPU.&lt;/li&gt;
+  &lt;li&gt;Mali Midgard GPUs are based on SIMD (Single Instruction Multiple Data)
+and need explicit vectorization. In NVIDIA CUDA, parallelism is
+achieved by SIMT (Single Instruction Multiple Thread), which does
+not require explicit vectorization. But also notice that the newer
+Mali Bifrost GPUs are based on quad-style vectorization and do not
+require explicit vectorization.&lt;/li&gt;
+  &lt;li&gt;All threads in Mali GPUs have individual program counters. It means
+the &lt;code class=&quot;highlighter-rouge&quot;&gt;warp size&lt;/code&gt; is 1, so that branch divergence is not a major problem.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;optimization--convolution-as-example&quot;&gt;Optimization : Convolution as Example&lt;/h1&gt;
+&lt;p&gt;The convolution layer is the core of most deep neural networks and
+takes most of the computation time. So we take the convolution layer
+as an example to demonstrate how common optimization techniques like
+packing, tiling, unrolling and vectorization are applied in TVM.&lt;/p&gt;
+
+&lt;h2 id=&quot;im2col-with-gemm&quot;&gt;Im2Col with GEMM&lt;/h2&gt;
+&lt;p&gt;A well-known algorithm for the convolution layer is &lt;a href=&quot;https://petewarden.com/2015/04/20/why-gemm-is-at-the-heart-of-deep-learning/&quot;&gt;im2col&lt;/a&gt;,
+which converts the little 3D input cubes to columns of a matrix and
+performs a GEMM. The advantage of this method is the easy utilization of a
+highly optimized BLAS library.  However, the memory redundancy
+(9x memory for a 3x3 kernel) is awful.&lt;/p&gt;
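+
+&lt;p&gt;The memory blow-up is easy to see with a small NumPy sketch of ours, using the 56x56x256 layer discussed below with 1 pixel of padding:&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+
+def im2col(x, kh, kw):
+    # x: (CI, H, W) padded input; returns a (CI*kh*kw, OH*OW) matrix for a stride-1 convolution
+    ci, h, w = x.shape
+    oh, ow = h - kh + 1, w - kw + 1
+    cols = np.empty((ci * kh * kw, oh * ow), dtype=x.dtype)
+    row = 0
+    for c in range(ci):
+        for i in range(kh):
+            for j in range(kw):
+                cols[row] = x[c, i:i + oh, j:j + ow].reshape(-1)
+                row += 1
+    return cols
+
+x = np.random.rand(256, 58, 58).astype('float32')   # 56x56x256 input, padded by 1 pixel
+cols = im2col(x, 3, 3)
+print(x.nbytes / 2**20, cols.nbytes / 2**20)         # ~3.3 MB input vs ~27.6 MB im2col matrix
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;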
+
+&lt;h2 id=&quot;spatial-packing&quot;&gt;Spatial Packing&lt;/h2&gt;
+&lt;p&gt;Instead, we adopt a method to calculate the convolution directly, and apply the
+optimization techniques step by step. A convolution layer in VGG-16
+is used as the tuning case, whose configuration is listed below.
+We assume the batch size is 1 for inference.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Input Shape&lt;/th&gt;
+      &lt;th&gt;Output Shape&lt;/th&gt;
+      &lt;th&gt;Kernel Size&lt;/th&gt;
+      &lt;th&gt;Stride&lt;/th&gt;
+      &lt;th&gt;Padding&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;56x56x256&lt;/td&gt;
+      &lt;td&gt;56x56x256&lt;/td&gt;
+      &lt;td&gt;3x3&lt;/td&gt;
+      &lt;td&gt;(1, 1)&lt;/td&gt;
+      &lt;td&gt;(1, 1)&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;As a baseline, we also list the performance of this layer in
+Arm Compute Library.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h3 id=&quot;declare-the-computation-tiling-and-packing&quot;&gt;Declare the computation: tiling and packing&lt;/h3&gt;
+&lt;p&gt;Tiling and packing are two methods intended for better memory access.
+Tiling separates the whole computation into small blocks for better
+data reuse.  Packing re-lays out the input matrices according to the
+tiling so that we can access the memory sequentially, which reduces the
+cache miss rate.&lt;/p&gt;
+
+&lt;p&gt;We do tiling on the width dimension of the input image and CO dimension
+of the filter matrix.  This is described by &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.compute&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tiling factor
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;VW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# get input shape
+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;TH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;H_PAD&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;TW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;W_PAD&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# calc output shape
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;H_PAD&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;n&quot;&gt;OW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;W_PAD&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt;&lt;span class=&qu [...]
+
+&lt;span class=&quot;c1&quot;&gt;# data shape after packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dvshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;TH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;s [...]
+
+&lt;span class=&quot;c1&quot;&gt;# kernel shape after packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kvshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&l [...]
+
+&lt;span class=&quot;n&quot;&gt;ovshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt; &lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OW&lt;/span&gt;&lt;span class=&quo [...]
+
+&lt;span class=&quot;c1&quot;&gt;# define packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&l [...]
+    &lt;span class=&quot;n&quot;&gt;data_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&q [...]
+
+&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt [...]
+    &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&q [...]
+
+&lt;span class=&quot;c1&quot;&gt;# define convolution
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt [...]
+&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ovshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;sp [...]
+    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+            &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
+            &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;sp [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unpack to correct layout
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/ [...]
+                     &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;//&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt [...]
+                     &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'output_unpack'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tag&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'direct_conv_output'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We can inspect the defined IR by&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;I pick the convolution part here.&lt;/p&gt;
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;produce conv {
+  for (co, 0, 64) {
+    for (h, 0, 56) {
+      for (w, 0, 14) {
+        for (vw.init, 0, 4) {
+          for (vc.init, 0, 4) {
+            conv[((((((((co*56) + h)*14) + w)*4) + vw.init)*4) + vc.init)] = 0.000000f
+          }
+        }
+        for (ci, 0, 256) {
+          for (kh, 0, 3) {
+            for (kw, 0, 3) {
+              for (vw, 0, 4) {
+                for (vc, 0, 4) {
+                  conv[((((((((co*56) + h)*14) + w)*4) + vw)*4) + vc)] = (conv[((((((((co*56) + h)*14) + w)*4) + vw)*4) + vc)] + (data_vec[(((((((((h*14) + w)*256) + ci)*3) + kh)*6) + kw) + vw)]*kernel_vec[((((((((co*256) + ci)*3) + kh)*3) + kw)*4) + vc)]))
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h3 id=&quot;kernel-1-bind-thread&quot;&gt;Kernel 1: bind thread&lt;/h3&gt;
+&lt;p&gt;In TVM, we declare the computation first and then &lt;em&gt;schedule&lt;/em&gt; it.
+This mechanism decouples the algorithm from the implementation details. (This idea
+is from &lt;a href=&quot;http://halide-lang.org/&quot;&gt;Halide&lt;/a&gt;).&lt;/p&gt;
+
+&lt;p&gt;The following schedule simply binds axes to GPU threads, so that
+our code can run on Mali GPU.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# helper function for binding thread
+&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/s [...]
+    &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot; tile and bind 3d &quot;&quot;&quot;&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;or&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z_factor&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x_factor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x_factor&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;or&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+
+&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule data packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+
+&lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
+
+&lt;span class=&quot;c1&quot;&gt;# schedule conv
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
+
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;With this schedule, our code can now run, but the performance is terrible.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+      &lt;th&gt;speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+      &lt;td&gt;1x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 1: simple bind&lt;/td&gt;
+      &lt;td&gt;5.6154&lt;/td&gt;
+      &lt;td&gt;0.6588&lt;/td&gt;
+      &lt;td&gt;0.03x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h3 id=&quot;kernel-2-unrolling&quot;&gt;Kernel 2: unrolling&lt;/h3&gt;
+&lt;p&gt;Loop unrolling can reduce the instructions for loop control, reduce
+branch penalties and hide latency in reading memory.
+In TVM, this can be done easily by calling &lt;code class=&quot;highlighter-rouge&quot;&gt;s.unroll(axis)&lt;/code&gt;.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule data packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
+
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule conv
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
+
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+      &lt;th&gt;speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+      &lt;td&gt;1x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 1: simple bind&lt;/td&gt;
+      &lt;td&gt;5.6154&lt;/td&gt;
+      &lt;td&gt;0.6588&lt;/td&gt;
+      &lt;td&gt;0.03x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 2: + unrolling&lt;/td&gt;
+      &lt;td&gt;0.3707&lt;/td&gt;
+      &lt;td&gt;9.9796&lt;/td&gt;
+      &lt;td&gt;0.49x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h3 id=&quot;kernel3-vectorization&quot;&gt;Kernel 3: vectorization&lt;/h3&gt;
+&lt;p&gt;As mentioned before, we need to do vectorization explicitly
+in order to achieve the best performance on the Mali GPU.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule data packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unroll
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unroll
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! VECTORIZE HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;c1&quot;&gt;# schedule conv
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
+
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+
+&lt;span class=&quot;c1&quot;&gt;# unroll
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! VECTORIZE HERE !!&quot;&quot;&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;Kernel&lt;/th&gt;
+      &lt;th&gt;Cost (second)&lt;/th&gt;
+      &lt;th&gt;GFLOPS&lt;/th&gt;
+      &lt;th&gt;speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;GEMM method in ARMComputeLib&lt;/td&gt;
+      &lt;td&gt;0.1821&lt;/td&gt;
+      &lt;td&gt;20.3111&lt;/td&gt;
+      &lt;td&gt;1x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 1: simple bind&lt;/td&gt;
+      &lt;td&gt;5.6154&lt;/td&gt;
+      &lt;td&gt;0.6588&lt;/td&gt;
+      &lt;td&gt;0.03x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 2: + unrolling&lt;/td&gt;
+      &lt;td&gt;0.3707&lt;/td&gt;
+      &lt;td&gt;9.9796&lt;/td&gt;
+      &lt;td&gt;0.49x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;Kernel 3: + vectorization&lt;/td&gt;
+      &lt;td&gt;0.1304&lt;/td&gt;
+      &lt;td&gt;28.3679&lt;/td&gt;
+      &lt;td&gt;1.40x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h3 id=&quot;how-to-set-the-tunable-parameter&quot;&gt;How to set the tunable parameter&lt;/h3&gt;
+&lt;p&gt;Some of the tunable parameters above can be calculated directly.
+For the vectorized dimension &lt;code class=&quot;highlighter-rouge&quot;&gt;VC&lt;/code&gt;, we should fill the 128-bit register,
+so it can be set to 128/32=4 for float32 and 128/16=8 for float16.&lt;/p&gt;
+
+&lt;p&gt;More often, however, we cannot determine the optimal value analytically because of the
+complicated runtime behavior, so we use grid search in TVM. This can be
+done very effectively because we write Python code against TVM’s high-level
+IR rather than direct OpenCL code.&lt;/p&gt;
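+
+&lt;p&gt;To make the idea concrete, here is a minimal, self-contained sketch of such a grid search.
+The names grid_search, space and the evaluate callback are illustrative only, not TVM APIs;
+in practice evaluate would build the schedule above with the given parameters and time it
+on the Mali device with a time evaluator.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import itertools
+
+def grid_search(evaluate, space):
+    # space: dict mapping parameter name -&gt; list of candidate values
+    # evaluate: callable taking a parameter dict, returning the measured cost in seconds
+    best_cost, best_params = float('inf'), None
+    names = list(space)
+    for values in itertools.product(*(space[n] for n in names)):
+        params = dict(zip(names, values))
+        cost = evaluate(params)
+        if cost &lt; best_cost:
+            best_cost, best_params = cost, params
+    return best_params, best_cost
+
+# an example search space for the schedule above
+space = {'VH': [1, 2, 4], 'VW': [2, 4, 8], 'VC': [4, 8], 'num_thread': [4, 8, 16]}
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;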
+
+&lt;h3 id=&quot;the-generated-opencl-code&quot;&gt;The generated OpenCL code&lt;/h3&gt;
+&lt;p&gt;We can view the generated OpenCL code by&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;The OpenCL code is too long to be pasted here, and it is hard to read due
+to heavy unrolling. If interested, you can view it
+&lt;a href=&quot;https://github.com/merrymercy/tvm-mali/blob/master/data/kernels.cl&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h1 id=&quot;end-to-end-benchmarking&quot;&gt;End-to-End Benchmarking&lt;/h1&gt;
+&lt;p&gt;In this section, we compare the comprehensive performance between
+different backends on some popular deep neural networks.
+Our test environment is&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Firefly-RK3399 4G
+CPU: dual-core Cortex-A72 + quad-core Cortex-A53
+GPU: Mali-T860MP4
+
+Arm Compute Library : v17.12
+MXNet: v1.0.1
+Openblas: v0.2.18
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We use NNVM and TVM to do end-to-end compilation.&lt;/p&gt;
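+
+&lt;p&gt;A minimal sketch of what this end-to-end compilation can look like is shown below.
+It uses a stand-in model from nnvm.testing and illustrative target strings; the real
+benchmark scripts (linked at the end of this post) import the actual models and targets.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import nnvm.compiler
+import nnvm.testing
+import tvm
+from tvm.contrib import graph_runtime
+
+# stand-in model; the real benchmarks import models from MXNet instead
+net, params = nnvm.testing.resnet.get_workload(batch_size=1, image_shape=(3, 224, 224))
+
+# illustrative target strings for an OpenCL (Mali) device with an ARM64 host
+graph, lib, params = nnvm.compiler.build(
+    net, target='opencl', shape={'data': (1, 3, 224, 224)}, params=params,
+    target_host='llvm -target=aarch64-linux-gnu')
+
+# run through the graph runtime on the OpenCL device (RPC setup omitted)
+ctx = tvm.cl(0)
+module = graph_runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.run()
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;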
+
+&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/opt-mali/end2end.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;center&gt; Figure 2. Inference Speed of Different Backends on ImageNet&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;As shown in Figure 2, we test the inference speed on ImageNet.
+On Firefly-RK3399, the Mali GPU can be 2x ~ 4x faster than the 6-core big.LITTLE CPU.
+Our end-to-end pipeline is 1.4x ~ 2.2x faster than Arm Compute Library.
+We tried both the GEMM and direct methods for the convolution layers in
+Arm Compute Library; the GEMM method is always faster than the direct method
+in these test cases, so we only plot the results of the GEMM method.&lt;/p&gt;
+
+&lt;p&gt;Some results, like resnet18 on Arm Compute Library, are missing from Figure 2.
+This is because the graph runtime of Arm Compute Library currently does not support
+skip connections and has a poor NEON implementation of
+depthwise convolution. This also reflects the advantage of the NNVM
+software stack.&lt;/p&gt;
+
+&lt;h2 id=&quot;half-precision-performance&quot;&gt;Half-Precision Performance&lt;/h2&gt;
+&lt;p&gt;High precision is not very important for deep neural networks, especially
+for inference on mobile devices. Using low-precision arithmetic
+can make inference much faster. We also tested half-precision
+floating point numbers on the Mali GPU.&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th&gt;model&lt;/th&gt;
+      &lt;th&gt;backend&lt;/th&gt;
+      &lt;th&gt;Time Cost per Image (second)&lt;/th&gt;
+      &lt;th&gt;speedup over FP32&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td&gt;vgg16&lt;/td&gt;
+      &lt;td&gt;ACM-mali&lt;/td&gt;
+      &lt;td&gt;0.9694&lt;/td&gt;
+      &lt;td&gt;1.69x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;vgg16&lt;/td&gt;
+      &lt;td&gt;TVM-mali&lt;/td&gt;
+      &lt;td&gt;0.6896&lt;/td&gt;
+      &lt;td&gt;&lt;strong&gt;1.87x&lt;/strong&gt;&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;MobileNet 1.0&lt;/td&gt;
+      &lt;td&gt;TVM-mali&lt;/td&gt;
+      &lt;td&gt;0.0479&lt;/td&gt;
+      &lt;td&gt;1.60x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td&gt;ResNet18&lt;/td&gt;
+      &lt;td&gt;TVM-mali&lt;/td&gt;
+      &lt;td&gt;0.1183&lt;/td&gt;
+      &lt;td&gt;1.73x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;center&gt; Table 1. Inference Speed of FP16 on ImageNet&lt;/center&gt;
+&lt;p&gt;&lt;/p&gt;
+
+&lt;p&gt;In theory, FP16 can both double the peak compute and halve the memory consumption,
+thereby doubling the speed. But it needs good input shapes for
+longer vectorization and fine-tuning of some parameters.&lt;/p&gt;
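+
+&lt;p&gt;Assuming the sketch in the benchmarking section above, switching to FP16 is mostly a matter
+of requesting a float16 data type when compiling; the snippet below only illustrates that knob
+and is not the exact code used for Table 1.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# net, params and the input shape are the same placeholders as in the earlier sketch;
+# dtype='float16' asks the compiler to generate half-precision kernels
+graph, lib, params = nnvm.compiler.build(
+    net, target='opencl', shape={'data': (1, 3, 224, 224)},
+    dtype='float16', params=params)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;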
+
+&lt;h2 id=&quot;further-work-on-mobile-devices&quot;&gt;Further Work on Mobile Devices&lt;/h2&gt;
+&lt;p&gt;We should admit that there is still some room for improvement,
+mainly at the graph level, such as model compression and weight pre-layout.
+Further improvements in NNVM will try to solve these problems.&lt;/p&gt;
+
+&lt;h1 id=&quot;show-me-the-code&quot;&gt;Show me the code&lt;/h1&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/merrymercy/tvm-mali&quot;&gt;End-to-End benchmark&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/topi/python/topi/mali&quot;&gt;Convolution and Depthwise Convolution Schedule&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;bio--acknowledgement&quot;&gt;Bio &amp;amp; Acknowledgement&lt;/h1&gt;
+&lt;p&gt;&lt;a href=&quot;https://lmzheng.net&quot;&gt;Lianmin Zheng&lt;/a&gt; is an undergraduate
+student at SJTU Apex lab. He is interested in machine learning
+and building computer systems.&lt;/p&gt;
+
+&lt;p&gt;The author would like to thank
+&lt;a href=&quot;https://homes.cs.washington.edu/~tqchen/&quot;&gt;Tianqi Chen&lt;/a&gt; for his helpful
+advice and &lt;a href=&quot;https://github.com/yzhliu&quot;&gt;Yizhi Liu&lt;/a&gt; for his earlier work.&lt;/p&gt;
+
+&lt;h1 id=&quot;reference&quot;&gt;Reference&lt;/h1&gt;
+&lt;p&gt;[1] &lt;a href=&quot;https://developer.arm.com/docs/100614/0302&quot;&gt;ARM Mali GPU OpenCL Developer Guide&lt;/a&gt;
+[2] &lt;a href=&quot;https://developer.arm.com/&quot;&gt;ARM Developer&lt;/a&gt;&lt;/p&gt;
+</description>
+                <link>https://tvm.apache.org/2018/01/16/opt-mali-gpu</link>
+                <guid>https://tvm.apache.org/2018/01/16/opt-mali-gpu</guid>
+                <pubDate>Tue, 16 Jan 2018 00:00:00 -0800</pubDate>
+        </item>
+
+        <item>
+                <title>Remote Profile and Test Deep Learning Cross Compilation on Mobile Phones with TVM RPC</title>
+                <description>&lt;p&gt;TVM stack is an end to end compilation stack to deploy deep learning workloads to all hardware backends.
+Thanks to the NNVM compiler support in the TVM stack, we can now directly take model descriptions from deep learning frameworks and compile them to bare-metal code.
+An impressive feature of TVM is its ability to deploy computation workloads on different platforms, such as GPUs and mobile phones (more hardware backends will be supported).&lt;/p&gt;
+
+&lt;p&gt;However, when we want to test and profile cross compilation, it is hard to try different computation workloads on a heterogeneous device such as a Raspberry Pi or a mobile phone.
+In order to optimize a computation task, one has to edit the code on the development PC, compile, deploy to the device, test, and then modify the code again to see whether it accelerates. The workflow looks like this:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/flow1.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Is there any way to speed up this process?&lt;/p&gt;
+
+&lt;p&gt;Today we introduce an approach to deploy and test TVM workloads on Android phones. We developed a TVM runtime for Java and built an Android APP upon it. The Android APP takes a shared library as input and runs compiled functions on the mobile phone. Thus our workflow simplifies to:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/flow2.png&quot; alt=&quot;image&quot; width=&quot;50%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;With the help of the TVM RPC, one can build TVM functions and NDArrays on a remote device. The ability to cross-compile to different platforms makes it easy to develop on one platform and test on another.&lt;/p&gt;
+
+&lt;p&gt;The process is illustrated as follows:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/arch.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;run-tvm-app-on-android-phone&quot;&gt;Run TVM APP on Android Phone&lt;/h2&gt;
+
+&lt;p&gt;You can find the Android RPC APP in &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/android_rpc&quot;&gt;apps/android_rpc&lt;/a&gt;. Please follow the instructions to build it for your Android device. Once the APK is built, sign it using &lt;code class=&quot;highlighter-rouge&quot;&gt;apps/android_rpc/dev_tools&lt;/code&gt; and install it on the phone. The APP looks like:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/app.png&quot; alt=&quot;image&quot; width=&quot;25%&quot; /&gt;
+&lt;img src=&quot;/images/android_rpc/app_error.png&quot; alt=&quot;image&quot; width=&quot;25%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Usually we cannot start a standalone server on a mobile phone; instead, we start a proxy server and use our app to connect to it.&lt;/p&gt;
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;python &lt;span class=&quot;nt&quot;&gt;-m&lt;/span&gt; tvm.exec.rpc_proxy
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;create-ndarray-on-the-phone&quot;&gt;Create NDArray on the Phone&lt;/h2&gt;
+
+&lt;p&gt;Now we can connect to the proxy server from the laptop:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;9090&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;This will give us a handler &lt;code class=&quot;highlighter-rouge&quot;&gt;remote&lt;/code&gt; which we can use to communicate with the mobile phone. For instance, the following lines create a 1024x1024 matrix on the phone’s GPU:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span [...]
+	&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;&lt;span cla [...]
+	&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;When &lt;code class=&quot;highlighter-rouge&quot;&gt;A.asnumpy()&lt;/code&gt; is called from the laptop, the matrix &lt;code class=&quot;highlighter-rouge&quot;&gt;A&lt;/code&gt; will be copied to the phone’s RAM and then transferred to the laptop through the proxy server. The TVM RPC interface is transparent to users.&lt;/p&gt;
+
+&lt;h2 id=&quot;gemm-matrix-multiplication-on-the-phone&quot;&gt;GEMM (Matrix Multiplication) on the Phone&lt;/h2&gt;
+
+&lt;p&gt;Now we are going to introduce how to test matrix multiplication on an Android phone. First let’s define the very simple GEMM schedule:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;gemm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
+    &lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
+    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
+
+    &lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+        &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ii&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;jj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span  [...]
+        &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'C'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;threadIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;bo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;to&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=&q [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class=& [...]
+
+    &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&q [...]
+
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    	&lt;span class=&quot;s&quot;&gt;&quot;opencl&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
+    	&lt;span class=&quot;n&quot;&gt;target_host&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm -target=arm64-linux-android&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
+    	&lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;There’s nothing special except the last line. Here we set the target to ‘opencl’ since this is the computation language which our Mali GPU supports. Note that we set &lt;code class=&quot;highlighter-rouge&quot;&gt;target_host&lt;/code&gt; to ‘&lt;code class=&quot;highlighter-rouge&quot;&gt;llvm -target=arm64-linux-android&lt;/code&gt;’, it depends on what architecture your Android Phone is. We tested on Samsung Galaxy S6 Edge, which has a Mali-T760 GPU. Here is the CPU info for  [...]
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nv&quot;&gt;$ &lt;/span&gt;adb shell
+shell@zenltechn:/ &lt;span class=&quot;nv&quot;&gt;$ &lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;cat&lt;/span&gt; /proc/cpuinfo
+Processor	: AArch64 Processor rev 2 &lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;aarch64&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt;
+processor	: 0
+processor	: 1
+processor	: 2
+processor	: 3
+processor	: 4
+processor	: 5
+processor	: 6
+processor	: 7
+Features	: fp asimd aes pmull sha1 sha2 crc32
+CPU implementer	: 0x41
+CPU architecture: AArch64
+CPU variant	: 0x0
+CPU part	: 0xd03
+CPU revision	: 2
+
+Hardware	: SAMSUNG Exynos7420
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Please refer to &lt;a href=&quot;https://clang.llvm.org/docs/CrossCompilation.html#target-triple&quot;&gt;target triple&lt;/a&gt; to learn the compile options for LLVM.&lt;/p&gt;
+
+&lt;p&gt;We use &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.contrib.ndk&lt;/code&gt; to build the shared library for the Android system,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;, [...]
+&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gemm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bn&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;256&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tempdir&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relpath&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;export_library&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ndk&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_shared&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;ndk.create_shared&lt;/code&gt; reads the environment variable &lt;code class=&quot;highlighter-rouge&quot;&gt;TVM_NDK_CC&lt;/code&gt; to find the compiler &amp;amp; linker for the Android device. We can easily use the NDK to generate a standalone toolchain for our device. For example, the following commands generate standalone compilers and linkers for ARM64 Android devices.&lt;/p&gt;
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nb&quot;&gt;cd&lt;/span&gt; /opt/android-ndk/build/tools/
+./make-standalone-toolchain.sh &lt;span class=&quot;nt&quot;&gt;--platform&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;android-24 &lt;span class=&quot;nt&quot;&gt;--use-llvm&lt;/span&gt; &lt;span class=&quot;nt&quot;&gt;--arch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;arm64 &lt;span class=&quot;nt&quot;&gt;--install-dir&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;/opt/android-toolchain-arm64
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
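+
+&lt;p&gt;With the standalone toolchain in place, TVM_NDK_CC only needs to point at its compiler
+before export_library is called. The path and binary name below are just an illustration and
+depend on your NDK version and install directory.&lt;/p&gt;
+
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import os
+# hypothetical path: adjust to the standalone toolchain generated above
+os.environ['TVM_NDK_CC'] = '/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++'
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;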
+
+&lt;p&gt;If everything goes right, we’ve got a shared library ‘gemm_gpu.so’. Now let’s upload it to the mobile phone, make the phone load the module and get a remote handler,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&qu [...]
+
+&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;upload&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Create the remote arrays and print the running time,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;sp [...]
+
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
+
+&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+
+&lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;time_evaluator&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;entry_name&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &l [...]
+&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;span class=&quot [...]
+&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;g secs/op, &lt;/span&gt;&lt;span class=&quot;si&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;g GFLOPS'&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;%&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt;&lt [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Now we can verify the results on PC,&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_almost_equal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;decimal&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;In the case above, we develop and cross-compile to a binary file for our mobile phone. Through the proxy server, the binary is uploaded to the phone and run in its JVM. This approach makes it easy to develop and test different computation workloads on Android.&lt;/p&gt;
+
+&lt;h2 id=&quot;java-runtime-for-tvm&quot;&gt;Java Runtime for TVM&lt;/h2&gt;
+
+&lt;p&gt;The Android APP is built on top of the Java runtime, which provides minimal support for TVM Function and NDArray. Here’s an example of registering a function in tvm4j:&lt;/p&gt;
+
+&lt;div class=&quot;language-java highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nc&quot;&gt;Function&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;func&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;Function&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;convertFunc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt; [...]
+      &lt;span class=&quot;nd&quot;&gt;@Override&lt;/span&gt; &lt;span class=&quot;kd&quot;&gt;public&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;Object&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;invoke&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;TVMValue&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot [...]
+        &lt;span class=&quot;nc&quot;&gt;StringBuilder&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;new&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;StringBuilder&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;TVMValue&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;append&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;asString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;());&lt;/span&gt;
+        &lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;toString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+      &lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;o&quot;&gt;});&lt;/span&gt;
+&lt;span class=&quot;nc&quot;&gt;TVMValue&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushArg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;Hello&quot;&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushA [...]
+&lt;span class=&quot;n&quot;&gt;assertEquals&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;Hello World!&quot;&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;asString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;());&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As we have seen in the GEMM part, one can build a shared library with Python and execute it from Java:&lt;/p&gt;
+
+&lt;div class=&quot;language-java highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;ml.dmlc.tvm.Module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;ml.dmlc.tvm.NDArray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;ml.dmlc.tvm.TVMContext&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;java.io.File&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;java.util.Arrays&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;;&lt;/span&gt;
+
+&lt;span class=&quot;kd&quot;&gt;public&lt;/span&gt; &lt;span class=&quot;kd&quot;&gt;class&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;LoadAddFunc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;kd&quot;&gt;public&lt;/span&gt; &lt;span class=&quot;kd&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;main&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;String&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;{&lt;/span&gt;
+    &lt;span class=&quot;nc&quot;&gt;String&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;loadingDir&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;];&lt;/span&gt;
+    &lt;span class=&quot;nc&quot;&gt;Module&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fadd&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;Module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;loadingDir&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;File&lt; [...]
+
+    &lt;span class=&quot;nc&quot;&gt;TVMContext&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;TVMContext&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;cpu&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+
+    &lt;span class=&quot;kt&quot;&gt;long&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;new&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;long&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;};&lt;/span&gt;
+    &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span& [...]
+    &lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;copyFrom&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;new&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;[]{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+    &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nc&quot;&gt;NDArray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ctx&lt;/span& [...]
+
+    &lt;span class=&quot;n&quot;&gt;fadd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;entryFunc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;().&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushArg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;pushArg&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span& [...]
+    &lt;span class=&quot;nc&quot;&gt;System&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;out&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;println&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nc&quot;&gt;Arrays&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;toString&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;(&lt;/span&g [...]
+
+    &lt;span class=&quot;n&quot;&gt;arr&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fadd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;na&quot;&gt;release&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;();&lt;/span&gt;
+  &lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;o&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Once you have built the TVM library following the &lt;a href=&quot;http://docs.tvmlang.org/how_to/install.html&quot;&gt;Installation Guide&lt;/a&gt;, run&lt;/p&gt;
+
+&lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;make jvmpkg
+make jvminstall
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;This will compile, package, and install tvm4j in your local Maven repository. Please refer to &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/jvm&quot;&gt;tvm4j&lt;/a&gt; for more information.&lt;/p&gt;
+
+&lt;h2 id=&quot;remote-profile-and-test-on-iphoneipad&quot;&gt;Remote Profile and Test on iPhone/iPad&lt;/h2&gt;
+
+&lt;p&gt;Besides the Android RPC application, we also provide an &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/ios_rpc&quot;&gt;iOS RPC app&lt;/a&gt;, through which we can easily profile and test TVM computation workloads on an iPhone or iPad. It works almost the same way as on Android, though Xcode and an iOS device are required.&lt;/p&gt;
+</description>
+                <link>https://tvm.apache.org/2017/11/08/android-rpc-introduction</link>
+                <guid>https://tvm.apache.org/2017/11/08/android-rpc-introduction</guid>
+                <pubDate>Wed, 08 Nov 2017 00:00:00 -0800</pubDate>
+        </item>
+
+        <item>
+                <title>Bringing AMDGPUs to TVM Stack and NNVM Compiler with ROCm</title>
+                <description>&lt;p style=&quot;text-align: center&quot;&gt;Aditya Atluri, Advanced Micro Devices, Inc.&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Masahiro Masuda, Ziosoft, Inc.&lt;/p&gt;
+
+&lt;p&gt;We are pleased to announce a new GPU backend for TVM stack - ROCm backend for AMD GPUs. If you are not familiar with TVM, you can refer to &lt;a href=&quot;http://tvmlang.org/2017/08/17/tvm-release-announcement.html&quot;&gt;the earlier announcement&lt;/a&gt; first. In short, TVM stack is an end to end compilation stack to deploy deep learning workloads to all hardware backends. Today’s announcement focuses on the code generator support for AMD GPUs. Specifically, we developed a [...]
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/tvm_rocm_overview.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The TVM stack is developed by an open source community under the Apache-2.0 License. ROCm backend support was done with help from the community. Aditya first implemented the codegen and runtime. He was later joined by Masahiro. Masahiro’s full-time job is not related to TVM or AMD GPUs. Nonetheless, TVM got him excited and he has been involved in fixing bugs, resolving all failing unit tests, and adding math function support to the codegen.&lt;/p&gt;
+
+&lt;h2 id=&quot;rocm-stack&quot;&gt;ROCm stack&lt;/h2&gt;
+
+&lt;p&gt;Radeon Open Compute (ROCm) is an open-source initiative by AMD to leverage the compute power of current and future generation GPUs. The ROCm software stack is a great tool to express and run the most commonly used GPU programming models and achieve peak performance. Not only is ROCm an open-source stack, it is an open stack, which means all the ISA and hardware features are well documented and programmable by developers. Developers can experiment with different programming models and try out multiple [...]
+
+&lt;p&gt;TVM leverages the open-source nature of the ROCm stack by using the LLVM AMDGPU backend code generator. TVM translates its intermediate representation (IR) into LLVM IR; this is where the openness of the ROCm stack takes over. TVM’s LLVM AMDGPU CodeGen pass converts the LLVM IR into GPU assembly and object code, which is later invoked to run the whole network, a group of layers, or a single layer.&lt;/p&gt;
+
+&lt;p&gt;On the ROCm stack there is no virtual ISA: you get exactly what you ask for, no less and no more. Hence, one can schedule operations in a kernel at the granularity of a single instruction, without worrying about instruction reordering and other optimizations you did not ask for.&lt;/p&gt;
+
+&lt;h2 id=&quot;using-nnvm-compiler-with-rocm-backend&quot;&gt;Using NNVM Compiler with ROCm backend&lt;/h2&gt;
+
+&lt;p&gt;Thanks to the TVM stack, we can today compile models from popular deep learning frameworks such as MXNet and PyTorch directly into AMD GPU assembly using the NNVM compiler. With the ROCm backend, the generic workflow is as follows.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/rocm_workflow.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;We have put together working examples of compiling models from MXNet and PyTorch with NNVM, and running them on AMD GPUs with ROCm backend. More frameworks are supported via the NNVM compiler stack. The repository is available &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;The script &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm/blob/master/mxnet_imagenet_inference.py&quot;&gt;mxnet_imagenet_inference.py&lt;/a&gt; demonstrates ImageNet inference on AMD GPUs with a recently introduced MXNet-Gluon model. It does the following (a minimal sketch of these steps is shown right after the list):&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Loads Resnet 50 model from &lt;a href=&quot;https://mxnet.incubator.apache.org/versions/master/api/python/gluon/model_zoo.html&quot;&gt;the Gluon model zoo&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Converts Gluon Resnet 50 model to NNVM graph format, using &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_mxnet (...)&lt;/code&gt;&lt;/li&gt;
+  &lt;li&gt;Compiles and executes the graph with ROCm backend&lt;/li&gt;
+&lt;/ul&gt;
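+
+&lt;p&gt;The following is only a minimal sketch of those three steps, assuming the NNVM/TVM Python APIs available at the time of writing (&lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_mxnet&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.compiler.build&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.contrib.graph_runtime&lt;/code&gt;); please refer to the linked script for the complete, tested version.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;import numpy as np
+import tvm
+import nnvm.frontend
+import nnvm.compiler
+from tvm.contrib import graph_runtime
+from mxnet.gluon.model_zoo.vision import get_model
+
+# 1. Load Resnet 50 from the Gluon model zoo
+block = get_model(&quot;resnet50_v1&quot;, pretrained=True)
+
+# 2. Convert the Gluon model to an NNVM graph and parameter dict
+sym, params = nnvm.frontend.from_mxnet(block)
+
+# 3. Compile for the ROCm backend and run one inference
+shape_dict = {&quot;data&quot;: (1, 3, 224, 224)}
+graph, lib, params = nnvm.compiler.build(sym, &quot;rocm&quot;, shape_dict, params=params)
+
+ctx = tvm.rocm(0)
+module = graph_runtime.create(graph, lib, ctx)
+module.set_input(**params)
+module.set_input(&quot;data&quot;, np.random.uniform(size=(1, 3, 224, 224)).astype(&quot;float32&quot;))
+module.run()
+out = module.get_output(0, tvm.nd.empty((1, 1000), ctx=ctx))
+print(&quot;TVM prediction top-1:&quot;, np.argmax(out.asnumpy()))&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;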
+
+&lt;p&gt;The example comes with an image of the following cat.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/cat.png&quot; alt=&quot;image&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Running our network, it predicts this image as “tiger cat”, among 1000 categories.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-plain&quot; data-lang=&quot;plain&quot;&gt;$ python mxnet_imagenet_inference.py
+Testing model resnet50_v1
+x (1, 3, 224, 224)
+TVM prediction top-1: 282 tiger cat&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;The script &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm/blob/master/advanced_superres_onnx.py&quot;&gt;advanced_superres_onnx.py&lt;/a&gt; gives an example of loading a model trained with PyTorch. The model is stored in the &lt;a href=&quot;https://onnx.ai/&quot;&gt;ONNX&lt;/a&gt; format. In this example, our network takes a low resolution image as input and outputs a 4x high resolution image. We refer the details of the problem setup and the network archit [...]
+
+&lt;p&gt;In order to use models in the ONNX format with NNVM, we first use &lt;a href=&quot;https://github.com/onnx/onnx&quot;&gt;the ONNX library&lt;/a&gt; to load the ONNX model into a protocol buffer object. We can then use &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_onnx(...)&lt;/code&gt; to obtain an equivalent NNVM graph. With an NNVM graph in hand, we can follow the generic workflow of compilation and graph execution outlined above.&lt;/p&gt;
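+
+&lt;p&gt;Concretely, that first step looks roughly like the sketch below. The file name is hypothetical, and the snippet assumes the &lt;code class=&quot;highlighter-rouge&quot;&gt;onnx&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm&lt;/code&gt; Python packages of that time; see the linked script for the full working example.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;import onnx
+import nnvm.frontend
+
+# Load the ONNX model into a protocol buffer object
+onnx_model = onnx.load(&quot;super_resolution.onnx&quot;)  # hypothetical file name
+
+# Obtain an equivalent NNVM graph and parameter dict
+sym, params = nnvm.frontend.from_onnx(onnx_model)
+
+# sym and params can now go through nnvm.compiler.build(sym, &quot;rocm&quot;, ...)
+# and tvm.contrib.graph_runtime, exactly as in the Gluon example above.&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;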
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/butterfly.png&quot; alt=&quot;image&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The input to the network is a 64 x 64 image on the left, and it outputs a 256 x 256 image on the right. In the middle is a 256 x 256 image obtained simply by resizing the input image with bicubic interpolation. The network outputs an image of far better quality.&lt;/p&gt;
+
+&lt;p&gt;The input images are taken from the original paper, and they are available &lt;a href=&quot;https://twitter.app.box.com/s/lcue6vlrd01ljkdtdkhmfvk7vtjhetog&quot;&gt;here&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h2 id=&quot;a-note-on-performance&quot;&gt;A Note on performance&lt;/h2&gt;
+
+&lt;p&gt;The current support on ROCm focuses on the functionality coverage. We have already seen promising performance results by simply adopting existing TVM schedules for CUDA backend. For example, you can try running &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/topi/recipe/gemm/cuda_gemm_square.py&quot;&gt;the gemm test script&lt;/a&gt; in the TVM repository and see the result. For two types of cards we tested, the current gemm recipe for square matrix multiplication (not  [...]
+This is already a promising start, as it is very hard to reach peak performance and we
+have not yet applied AMD GPU specific optimizations.
+We are starting to look at performance optimization and we expect more improvements to come.&lt;/p&gt;
+
+&lt;h2 id=&quot;walkthrough-of-rocm-backend&quot;&gt;Walkthrough of ROCm backend&lt;/h2&gt;
+
+&lt;p&gt;In the following part of this article we focus on explaining how to use the ROCm backend when working with TVM directly. All you need to do is build your TVM function under the target “rocm” and create a runtime context for it. Here, we show an example of ROCm backend usage, following ‘Vector Add Example’ in TVM’s &lt;a href=&quot;http://docs.tvmlang.org/tutorials/get_started.html#vector-add-example&quot;&gt;getting started tutorial&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;We start by setting up a compute operation and a schedule for the vector add kernel. This step is independent of the backend.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;__future__&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;absolute_import&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;print_function&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm&lt;/span&gt;
+&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;n&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&q [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p& [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&qu [...]
+
+&lt;p&gt;Next, to use the ROCm backend we build our kernel under the “rocm” target. This will cause TVM to use our new code generator. We also need a runtime context for the ROCm backend.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;rocm&quot;&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;After building the kernel and setting up a runtime context, we can launch our vector add kernel.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+
+&lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt; &l [...]
+
+&lt;p&gt;We can view the LLVM IR that TVM generates in the following way:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;You should see something like this:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-llvm&quot; data-lang=&quot;llvm&quot;&gt;&lt;span class=&quot;c1&quot;&gt;; ModuleID = 'myadd__kernel0'&lt;/span&gt;
+&lt;span class=&quot;err&quot;&gt;sour&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;datalayout&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64&quot;&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;triple&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;amdgcn-amd-amdhsa-hcc&quot;&lt;/span&gt;
+
+
+&lt;span class=&quot;c1&quot;&gt;; Function Attrs: nounwind&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_ker&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;ne&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;l&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k [...]
+&lt;span class=&quot;nl&quot;&gt;entry:&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workgroup.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workitem.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%6&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;-127&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%7&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;ashr&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%6&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;6&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%8&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;icmp&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;slt&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%7&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i1&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%8&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_then&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_else&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_then:&lt;/span&gt;                                          &lt;span class=&quot;c1&quot;&gt;; preds = %entry&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%9&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;shl&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;6&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end.sink.split&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_end.sink.split:&lt;/span&gt;                                &lt;span class=&quot;c1&quot;&gt;; preds = %if_else, %if_then&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;phi&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%21&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_else&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span [...]
+  &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%12&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%13&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%15&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;fadd&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%18&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%19&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;k&quot;&gt;store&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; [...]
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_end:&lt;/span&gt;                                           &lt;span class=&quot;c1&quot;&gt;; preds = %if_end.sink.split, %if_else&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;ret&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt;
+
+
+&lt;span class=&quot;nl&quot;&gt;if_else:&lt;/span&gt;                                          &lt;span class=&quot;c1&quot;&gt;; preds = %entry&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%20&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sub&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%21&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;shl&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;m&quot;&gt;6&lt;/span&gt;
+  &lt;span class=&quot;nv&quot;&gt;%22&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;icmp&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;slt&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%21&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%20&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i1&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%22&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end.sink.split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;p&gt;We can also view the GPU assembly that the ROCm backend generates. This is the real code that runs on your GPU.&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;asm&quot;&lt;/span&gt;&lt;sp [...]
+
+&lt;p&gt;The assembly should look something like this, omitting unnecessary details:&lt;/p&gt;
+
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-plain&quot; data-lang=&quot;plain&quot;&gt;        s_load_dword s1, s[4:5], 0x18
+        v_mov_b32_e32 v2, -1
+        v_mov_b32_e32 v1, 0
+        s_waitcnt lgkmcnt(0)
+        s_add_i32 s0, s1, 0xffffff81
+        s_ashr_i32 s0, s0, 6
+        s_cmp_ge_i32 s6, s0
+        s_cbranch_scc0 BB0_2
+        v_sub_i32_e32 v1, vcc, s1, v0
+        s_lshl_b32 s0, s6, 6
+        v_cmp_lt_i32_e32 vcc, s0, v1
+        v_mov_b32_e32 v2, 0
+        v_cndmask_b32_e64 v1, 0, -1, vcc
+BB0_2:
+        v_cmp_ne_u32_e32 vcc, 0, v2
+        v_cndmask_b32_e64 v2, 0, 1, vcc
+        v_cmp_ne_u32_e32 vcc, 1, v2
+        s_and_b64 vcc, exec, vcc
+        s_cbranch_vccnz BB0_4
+        s_lshl_b32 s0, s6, 6
+        v_mov_b32_e32 v1, -1
+BB0_4:
+        v_cmp_ne_u32_e32 vcc, 0, v1
+        v_mov_b32_e32 v1, s0
+        s_and_saveexec_b64 s[0:1], vcc
+        s_xor_b64 s[0:1], exec, s[0:1]
+        s_cbranch_execz BB0_6
+BB0_5:
+        s_load_dwordx2 s[2:3], s[4:5], 0x0
+        s_load_dwordx2 s[6:7], s[4:5], 0x8
+        v_add_i32_e32 v0, vcc, v1, v0
+        s_load_dwordx2 s[4:5], s[4:5], 0x10
+        v_ashrrev_i32_e32 v1, 31, v0
+        v_lshlrev_b64 v[0:1], 2, v[0:1]
+        s_waitcnt lgkmcnt(0)
+        v_add_i32_e32 v2, vcc, s4, v0
+        v_mov_b32_e32 v3, s5
+        v_addc_u32_e32 v3, vcc, v3, v1, vcc
+        flat_load_dword v2, v[2:3]
+        v_add_i32_e32 v4, vcc, s6, v0
+        v_mov_b32_e32 v3, s7
+        v_addc_u32_e32 v5, vcc, v3, v1, vcc
+        flat_load_dword v4, v[4:5]
+        v_mov_b32_e32 v3, s3
+        v_add_i32_e32 v0, vcc, s2, v0
+        v_addc_u32_e32 v1, vcc, v3, v1, vcc
+        s_waitcnt vmcnt(0) lgkmcnt(0)
+        v_add_f32_e32 v2, v2, v4
+        flat_store_dword v[0:1], v2
+BB0_6:
+        s_or_b64 exec, exec, s[0:1]
+        s_endpgm&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+
+&lt;h2 id=&quot;links&quot;&gt;Links&lt;/h2&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Github page of NNVM Compiler: &lt;a href=&quot;https://github.com/dmlc/nnvm&quot;&gt;https://github.com/dmlc/nnvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Github page of TVM: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Examples of ROCm backend with NNVM: &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm&quot;&gt;https://github.com/ROCmSoftwarePlatform/nnvm-rocm&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</description>
+                <link>https://tvm.apache.org/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm</link>
+                <guid>https://tvm.apache.org/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm</guid>
+                <pubDate>Mon, 30 Oct 2017 00:00:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>NNVM Compiler: Open Compiler for AI Frameworks</title>
+                <description>&lt;p style=&quot;text-align: center&quot;&gt;Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;Amazon Web Service AI team&lt;/p&gt;
+&lt;p style=&quot;text-align: center&quot;&gt;DMLC open-source community&lt;/p&gt;
+
+&lt;p&gt;Deep learning has become ubiquitous and indispensable. We are seeing a rising need for deploying deep learning workloads on many kinds of platforms such as mobile phones, GPU, IoT devices and specialized accelerators.  Last month, we announced TVM stack to close the gap between deep learning frameworks, and the performance- or efficiency-oriented hardware backends.  TVM stack makes it easy to build an end to end compilation for a deep learning framework.  However, we think it wo [...]
+
+&lt;p&gt;Today, the UW Allen School and the AWS AI team, together with other contributors, are excited to announce the release of the NNVM compiler, an open deep learning compiler that compiles front-end framework workloads directly to hardware backends. We built it using the two-level intermediate representation (IR) in the TVM stack.
+The reader is welcome to refer to the &lt;a href=&quot;http://www.tvmlang.org/2017/08/17/tvm-release-announcement.html&quot;&gt;original TVM announcement&lt;/a&gt; for more technical details about the TVM stack. With the help of the TVM stack, the NNVM compiler can:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Represent and optimize the common deep learning workloads in high level graph IR&lt;/li&gt;
+  &lt;li&gt;Transform the computation graph to minimize memory utilization, optimize data layout and fuse computation patterns for different hardware backends.&lt;/li&gt;
+  &lt;li&gt;Present an end to end compilation pipeline from front-end deep learning frameworks to bare metal hardware.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_compiler_stack.png&quot; alt=&quot;image&quot; width=&quot;612px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The NNVM compiler can directly take models from deep learning frameworks such as Apache MXNet.
+It also supports model exchange formats such as ONNX and CoreML. ONNX support enables NNVM to compile deep learning models from PyTorch, Caffe2 and CNTK.
+The CoreML frontend enables deployment of CoreML models to non-iOS devices.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_compiler_code.png&quot; alt=&quot;image&quot; width=&quot;712px&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;separation-of-optimization-and-deployment&quot;&gt;Separation of Optimization and Deployment&lt;/h2&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_deploy.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;The NNVM compiler applies graph level and tensor level optimizations and optimizes them jointly to get the best performance. We take a different approach from existing deep learning frameworks, which package the graph optimization with the deployment runtime.  The NNVM compiler adopts the conventional wisdom from compilers to separate the optimization from the actual deployment runtime. This approach offers substantial optimization but still keeps the runtime lightweight. The compiled mo [...]
+
+&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
+
+&lt;p&gt;NNVM compiler is still under active development, and we can expect more improvements to come, but we have started to see promising results.
+We benchmarked its performance and compared it against Apache MXNet on two typical hardware configurations: ARM CPU on Raspberry Pi and Nvidia GPU on AWS. Despite the radical architecture difference between these two chips, we can use the same infrastructure and only need to change the schedule for each type of hardware.&lt;/p&gt;
+
+&lt;h3 id=&quot;nvidia-gpu&quot;&gt;Nvidia GPU&lt;/h3&gt;
+
+&lt;p&gt;GPU benchmarks and schedules were contributed by Leyuan Wang (AWS/UCDavis) and Yuwei Hu (TuSimple). We compared the NNVM compiler against Apache MXNet with CUDA8 and cuDNN7 as the backend on an Nvidia K80. This is a very strong baseline, as Apache MXNet turns on auto-tuning to select the best kernel from cuDNN. We also used the optimized depthwise kernel in MXNet to optimize the MobileNet workload.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_k80_result.png&quot; alt=&quot;image&quot; width=&quot;400px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;As can be seen, the NNVM compiler generates code that outperforms Apache MXNet on the K80. These improvements are due to the joint graph level and kernel level optimizations. It is worth noting that the NNVM compiler generates all the optimized GPU kernels on its own without relying on external libraries like cuDNN.&lt;/p&gt;
+
+&lt;h3 id=&quot;raspberry-pi-3b&quot;&gt;Raspberry Pi 3b&lt;/h3&gt;
+
+&lt;p&gt;The Raspberry Pi compilation stack was contributed by Ziheng Jiang (AWS/FDU).
+We compared the NNVM compiler against Apache MXNet with OpenBLAS and NNPACK.
+We explored the setups to get the best performance out of MXNet: we turned on Winograd convolution in NNPACK for 3x3 convolutions, enabled multi-threading, and disabled the additional scheduler thread (so all threads are used by NNPACK).&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/nnvm/nnvm_rasp_result.png&quot; alt=&quot;image&quot; width=&quot;400px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;As can be seen, the code generated by the NNVM compiler is two times faster on ResNet18.
+The gap on MobileNet is mainly due to the lack of depthwise convolution in existing CPU DNN libraries. The NNVM compiler benefits from generating efficient ARM code directly.&lt;/p&gt;
+
+&lt;h2 id=&quot;acknowledgement&quot;&gt;Acknowledgement&lt;/h2&gt;
+&lt;p&gt;This project would not have been possible without our early contributors in the DMLC community.
+We would like to specially thank Yuwei Hu (TuSimple), Leyuan Wang (AWS/UCDavis), Joshua Z. Zhang (AWS)
+and Xingjian Shi (HKUST) for their early contributions to the project. We would also like to thank all the contributors
+to the TVM stack.&lt;/p&gt;
+
+&lt;p&gt;We also learnt a lot from the following projects when building NNVM Compiler.&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/Theano/Theano&quot;&gt;Theano&lt;/a&gt;: possibly the earliest compiler for deep learning&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/halide/Halide&quot;&gt;Halide&lt;/a&gt;: TVM uses &lt;a href=&quot;https://github.com/dmlc/HalideIR&quot;&gt;HalideIR&lt;/a&gt; as the data structure for
+arithmetic simplification and low level lowering. HalideIR is derived from Halide.
+We also learned from Halide when implementing the lowering pipeline in TVM.&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://github.com/inducer/loopy&quot;&gt;Loopy&lt;/a&gt;: use of integer set analysis and its loop transformation primitives.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h2 id=&quot;links&quot;&gt;Links&lt;/h2&gt;
+&lt;ul&gt;
+  &lt;li&gt;Github page of NNVM Compiler: &lt;a href=&quot;https://github.com/dmlc/nnvm&quot;&gt;https://github.com/dmlc/nnvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;Github page of TVM: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://news.cs.washington.edu/2017/10/06/allen-school-and-aws-team-up-on-new-nnvm-compiler-for-deep-learning-frameworks/&quot;&gt;UW Allen school blog about NNVM compiler&lt;/a&gt;&lt;/li&gt;
+  &lt;li&gt;&lt;a href=&quot;https://aws.amazon.com/blogs/ai/introducing-nnvm-compiler-a-new-open-end-to-end-compiler-for-ai-frameworks/&quot;&gt;AWS blogpost about NNVM compiler&lt;/a&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</description>
+                <link>https://tvm.apache.org/2017/10/06/nnvm-compiler-announcement</link>
+                <guid>https://tvm.apache.org/2017/10/06/nnvm-compiler-announcement</guid>
+                <pubDate>Fri, 06 Oct 2017 08:30:00 -0700</pubDate>
+        </item>
+
+        <item>
+                <title>Optimize Deep Learning GPU Operators with TVM: A Depthwise Convolution Example</title>
+                <description>&lt;p&gt;Efficient deep learning operators are at the core of deep learning systems.
+Usually these operators are hard to optimize and require great effort from HPC experts.
+&lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;TVM&lt;/a&gt;, an end to end tensor IR/DSL stack, makes this much easier.&lt;/p&gt;
+
+&lt;p&gt;This blog teaches you how to write high-performance GPU operator kernels with the help of TVM.
+We use depthwise convolution (i.e. &lt;a href=&quot;http://docs.tvmlang.org/api/python/topi.html#topi.nn.depthwise_conv2d_nchw&quot;&gt;topi.nn.depthwise_conv2d_nchw&lt;/a&gt;) as an example,
+and demonstrate how we can improve over the already hand-optimized CUDA kernel in TensorFlow.
+Our final version is 2x-4x faster than the optimized kernel in tf-1.2 under different workloads, and 3x-7x faster with operator fusion enabled.
+Below is the result tested on GTX1080, with filter size = [1, 256, 3, 3], stride = [1, 1], padding = ‘SAME’:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/tf_compare.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
+
+&lt;h2 id=&quot;introduction-to-depthwise-convolution&quot;&gt;Introduction to Depthwise Convolution&lt;/h2&gt;
+
+&lt;p&gt;Depthwise convolution is an important building block of modern architectures, such as Xception [1] and MobileNet [2].
+It’s an effective method to reduce the computational complexity of deep neural networks.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/conv_and_depthconv.png&quot; alt=&quot;image&quot; width=&quot;80%&quot; /&gt;&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;source: &lt;a href=&quot;http://machinethink.net/blog/googles-mobile-net-architecture-on-iphone/&quot;&gt;http://machinethink.net/blog/googles-mobile-net-architecture-on-iphone/&lt;/a&gt;&lt;/p&gt;
+
+&lt;p&gt;In TVM, depthwise convolution can be declared as:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# padding stage
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;height_after_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;width_after_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+        &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;all&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt;  [...]
+        &lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;PaddedInput&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;# depthconv stage
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),& [...]
+&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; & [...]
+&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_channel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+        &lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;channel_multiplier&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/spa [...]
+        &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'DepthwiseConv2d'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h2 id=&quot;general-gpu-optimization-guidelines&quot;&gt;General GPU Optimization Guidelines&lt;/h2&gt;
+
+&lt;p&gt;This part briefly talks about three concepts we should know when optimizing CUDA code: data reuse, shared memory and bank conflicts.
+If you already know them, you may skip this part.&lt;/p&gt;
+
+&lt;h3 id=&quot;data-reuse&quot;&gt;Data Reuse&lt;/h3&gt;
+&lt;p&gt;In modern computing architectures, the cost of loading data from memory is much higher than that of a single floating point computation [3].
+Because of that, we always want to reuse the input data after they are loaded into registers or shared memory (cache).&lt;/p&gt;
+
+&lt;p&gt;There are two forms of data reuse in depthwise convolution: filter reuse and input reuse. Filter reuse happens as the filter slides over the input channel and is used in multiple computations.
+Input reuse is realized through tiling; let’s take 3x3 depthwise conv as an example:&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/no_tiling.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Without tiling, each thread computes 1 output element and loads 3x3 input data. 16 threads together have 9x16 loads.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/tiling.png&quot; alt=&quot;image&quot; width=&quot;70%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;With tiling, each thread computes 2x2 output elements and loads 4x4 input data. 4 threads together have 16x4 loads.&lt;/p&gt;
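+
+&lt;p&gt;To make the comparison explicit, the tiny snippet below just spells out the arithmetic of the two figures above; it is purely illustrative and not part of any schedule.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# Global memory loads needed for the same 4x4 output tile (3x3 filter, stride 1)
+loads_no_tiling   = 16 * (3 * 3)   # 16 threads, one 3x3 window each: 144 loads
+loads_with_tiling = 4 * (4 * 4)    # 4 threads, one 4x4 window each: 64 loads
+
+print(loads_no_tiling / 16.0)      # 9.0 loads per output element
+print(loads_with_tiling / 16.0)    # 4.0 loads per output element
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;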
+
+&lt;h3 id=&quot;shared-memory-and-bank-conflicts&quot;&gt;Shared Memory and Bank Conflicts&lt;/h3&gt;
+&lt;p&gt;Shared memory can be seen as a cache in the GPU. It is on-chip and much faster than global memory.&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/GPU_memory_hierarchy.png&quot; alt=&quot;image&quot; width=&quot;256px&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Shared memory is allocated per block. It’s common practice to load data from global memory into shared memory, and then all threads in the block read data from shared memory.&lt;/p&gt;
+
+&lt;p&gt;The size of shared memory is limited (usually 48K), so we must be cautious of shared memory overflow.
+Besides, too much shared memory allocated to one block limits the number of active blocks per multiprocessor.&lt;/p&gt;
+
+&lt;p&gt;Another performance issue with shared memory is bank conflicts. Shared memory is divided into equally sized memory modules (banks) that can be accessed simultaneously.
+However, if multiple threads access the same memory bank (causing bank conflicts), the accesses are serialized, which decreases the effective bandwidth.&lt;/p&gt;
+
+&lt;p&gt;Shared memory banks are organized such that successive addresses are assigned to successive banks.
+To avoid bank conflicts, it’s better that successive threads access successive memory addresses, as illustrated below (each color represents one shared memory bank):&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/bank_conflicts.png&quot; alt=&quot;image&quot; width=&quot;95%&quot; /&gt;&lt;/p&gt;
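+
+&lt;p&gt;As a rough illustration (assuming 32 banks of 4-byte words, which is typical for recent Nvidia GPUs), the bank that a float access falls into can be modeled as below: consecutive threads reading consecutive floats hit 32 distinct banks, while a stride of 32 floats sends every thread to the same bank.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;NUM_BANKS = 32  # assumed bank count; each bank serves 4-byte words
+
+def bank(word_index):
+    return word_index % NUM_BANKS
+
+# stride 1: threads 0..31 touch banks 0..31, so there is no conflict
+print([bank(t * 1) for t in range(32)])
+# stride 32: threads 0..31 all touch bank 0, a 32-way bank conflict
+print([bank(t * 32) for t in range(32)])
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;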
+
+&lt;p&gt;For more details on shared memory and bank conflicts, please refer to &lt;a href=&quot;https://devblogs.nvidia.com/parallelforall/using-shared-memory-cuda-cc/&quot;&gt;this Nvidia blog post&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;Ok, now let’s start optimizing depthwise convolution in TVM.&lt;/p&gt;
+
+&lt;h2 id=&quot;schedule-optimization&quot;&gt;Schedule Optimization&lt;/h2&gt;
+
+&lt;h3 id=&quot;compute-paddedinput-inline-to-save-memory-allocation&quot;&gt;Compute PaddedInput Inline to Save Memory Allocation&lt;/h3&gt;
+&lt;p&gt;As we saw in the declaration above, padding is declared explicitly as a separate stage. We compute it inline to avoid redundant memory allocation:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;h3 id=&quot;divide-one-large-channel-into-smaller-blocks&quot;&gt;Divide One Large Channel into Smaller Blocks&lt;/h3&gt;
+&lt;p&gt;One straightforward schedule for depthwise convolution is that one CUDA block takes care of one input channel and the corresponding filters, loading them into shared memory and then computing:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;IS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/spa [...]
+&lt;span class=&quot;n&quot;&gt;FS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;shared&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.y&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;# bind the dimension of batch (N in NCHW) with block_y
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;c1&quot;&gt;# bind the dimension of channel (C in NCHW) with block_x
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;s [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;We measure the average time cost of 1000 runs on a GTX 1080 and compare with &lt;a href=&quot;https://www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution#depthwise_conv2d&quot;&gt;depthwise_conv2d in TensorFlow&lt;/a&gt; (a minimal sketch of the timing setup is shown right after the table).
+Here is the result:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Filter&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;stride&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;tf-1.2 SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 21, 21]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;16.1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;9.1&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;34.8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;14.5&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 64, 64]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;130.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;98.9&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;251.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;387.4&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
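+
+&lt;p&gt;As promised, here is a minimal sketch of the timing setup (not the exact benchmark script of this post). It assumes &lt;code class=&quot;highlighter-rouge&quot;&gt;s&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;Input&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;Filter&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;Output&lt;/code&gt; are the schedule and tensors from part 1, declared for one row of the table above (Input = [1, 256, 96, 96], Filter = [256, 1, 3, 3], stride = [1, 1], SAME padding):&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import numpy as np
+import tvm
+
+# Build the CUDA kernel from the schedule discussed above.
+f = tvm.build(s, [Input, Filter, Output], &quot;cuda&quot;)
+ctx = tvm.gpu(0)
+
+# Random input and filter, plus an empty output buffer on the GPU.
+input_tvm = tvm.nd.array(np.random.uniform(size=(1, 256, 96, 96)).astype(&quot;float32&quot;), ctx)
+filter_tvm = tvm.nd.array(np.random.uniform(size=(256, 1, 3, 3)).astype(&quot;float32&quot;), ctx)
+output_tvm = tvm.nd.array(np.zeros((1, 256, 96, 96), dtype=&quot;float32&quot;), ctx)
+
+# Average the kernel time over 1000 runs, as in the tables.
+timer = f.time_evaluator(f.entry_name, ctx, number=1000)
+print(&quot;average time cost: %.1f us&quot; % (timer(input_tvm, filter_tvm, output_tvm).mean * 1e6))
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;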
+
+&lt;p&gt;As we can see, this schedule performs well for small channel sizes like 21 x 21 or 32 x 32; however, its performance drops sharply once the channel size grows to 64 x 64 or beyond.
+One main reason is that allocating too much shared memory to one block limits the number of active blocks per multiprocessor.&lt;/p&gt;
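+
+&lt;p&gt;A rough back-of-the-envelope estimate illustrates the pressure (the exact occupancy depends on the GPU): with float32 data and SAME padding for a 3 x 3 filter, a 64 x 64 channel needs a padded 66 x 66 tile in shared memory, i.e. 66 x 66 x 4 B, which is about 17 KB, and a 96 x 96 channel needs a 98 x 98 tile of about 38 KB; since a multiprocessor typically has only a few tens of KB of shared memory, just a handful of such blocks can be resident at a time.&lt;/p&gt;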
+
+&lt;p&gt;We modify the schedule to divide one large channel into smaller blocks. For example, one channel (64 x 64 or 96 x 96) is divided into blocks of 32 x 32,
+and one CUDA block takes care of one 32 x 32 block:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;blocking_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;blocking_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of height (H in NCHW)
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of width (W in NCHW)
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;s [...]
+&lt;span class=&quot;c1&quot;&gt;# assign one 32 x 32 block to one cuda block
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt;&lt;span class=& [...]
+&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=& [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Here is the new result:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;[blocking_h, blocking_w]&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;tf-1.2 SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 64, 64]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;130.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;63.4&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;251.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;132.5&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Our blocking strategy works! For the 64 x 64 channel size, it brings a 1.6x speedup (98.9us -&amp;gt; 63.4us); for the 96 x 96 channel size, it brings a 2.9x speedup (387.4us -&amp;gt; 132.5us).&lt;/p&gt;
+
+&lt;h3 id=&quot;tuning-parameters-of-thread-numbers&quot;&gt;Tuning Parameters of Thread Numbers&lt;/h3&gt;
+
+&lt;p&gt;How should we schedule the workload, say a 32 x 32 block, among the threads of one CUDA block? Intuitively, it should be like this:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class= [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;There are two parameters in the schedule: &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt;. How can we determine the optimal combination?
+Well, let’s first do some experiments. Below is the result with Filter = [256, 1, 3, 3] and stride = [1, 1]:&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Case&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_thread_y&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_thread_x&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;9.7&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;2&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8.8&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;3&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;17.7&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 32, 32]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;32.5&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;We can make several interesting observations from the above results:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;
+    &lt;p&gt;Case 2 is faster than case 1. In case 2, each thread computes an 8x1 tile of the output, which corresponds to a 10x3 tile of the input.
+It has better data reuse than case 1’s 4x1 tile.&lt;/p&gt;
+  &lt;/li&gt;
+  &lt;li&gt;
+    &lt;p&gt;Case 3 is slower than case 2, because in case 3 the workload per thread is too large and incurs a high cost of local memory reads.&lt;/p&gt;
+  &lt;/li&gt;
+  &lt;li&gt;
+    &lt;p&gt;Case 4 is slower than case 3. It’s because &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x = 32&lt;/code&gt; ensures no bank conflicts, while &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y = 32&lt;/code&gt; doesn’t.&lt;/p&gt;
+  &lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;To summarize what we learn from the above observations:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;A large tile is good for data reuse, but not good for local memory reads.&lt;/li&gt;
+  &lt;li&gt;The influence of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; on bank conflicts is asymmetric.&lt;/li&gt;
+  &lt;li&gt;Finding the optimal combination of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; means striking a balance among efficient shared memory access (avoiding bank conflicts), data reuse, and local memory reads.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;Pretty tricky. So what exactly should we do to find the optimal combination? The answer is brute-force search.
+We can pass &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; as arguments to the schedule function and try all possible combinations to find the optimal one. This can be done easily in TVM:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot; [...]
+    &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;do_schedule_as_usual&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inf&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;all_possible_combinations&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;test_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;optimal_combination&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;In fact, this can be seen as a simple auto-scheduler.&lt;/p&gt;
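+
+&lt;p&gt;As a concrete illustration, the &lt;code class=&quot;highlighter-rouge&quot;&gt;all_possible_combinations&lt;/code&gt; placeholder above could simply enumerate factor pairs of the 32 x 32 block (a sketch under that assumption, not the exact search used for the numbers in this post):&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import itertools
+
+# Candidate thread counts along y and x; each divides the 32 x 32 block evenly.
+factors = [1, 2, 4, 8, 16, 32]
+# Keep only pairs within CUDA's limit of 1024 threads per block.
+all_possible_combinations = [(ty, tx) for ty, tx in itertools.product(factors, factors)
+                             if ty * tx &amp;lt;= 1024]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;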
+
+&lt;h3 id=&quot;vthread-and-strided-patterns&quot;&gt;Vthread and Strided Patterns&lt;/h3&gt;
+&lt;p&gt;Vthread (virtual thread) in TVM is introduced to support strided patterns. We can use it this way:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;num_vthread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of height (H in NCHW) twice
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vyi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;c1&quot;&gt;# split the dimension of width (W in NCHW) twice
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vxi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;c1&quot;&gt;# bind thread and vthread respectively
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt;&lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=& [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Let’s print the IR to see what vthread does:&lt;/p&gt;
+
+&lt;div class=&quot;language-c++ highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* Input = [1, 1, 32, 32], Filter = [1, 1, 3, 3], stride = [1, 1], padding = 'SAME' */&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.y, Range(min=0, extent=8), threadIdx.y)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.x, Range(min=0, extent=8), threadIdx.x)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)& [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;) [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;) [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;) [...]
+      &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[(((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;& [...]
+        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+      &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Without vthread (just set to 1), the IR is:&lt;/p&gt;
+
+&lt;div class=&quot;language-c++ highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* Input = [1, 1, 32, 32], Filter = [1, 1, 3, 3], stride = [1, 1], padding = 'SAME' */&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.y, Range(min=0, extent=8), threadIdx.y)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.x, Range(min=0, extent=8), threadIdx.x)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+      &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)& [...]
+      &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+      &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;As we can see, when &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_y = 2&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_x = 2&lt;/code&gt;, the 32 x 32 channel is divided into four sub-channels of 16 x 16.
+Each thread computes four output elements at a time, one element in one sub-channel.&lt;/p&gt;
+
+&lt;p&gt;Below is the result with Filter = [256, 1, 3, 3], stride = [1, 1], blocking_h = 32, blocking_w = 32:&lt;/p&gt;
+
+&lt;style&gt;
+table th:nth-of-type(1) {
+    width: 120px;
+}
+table th:nth-of-type(2) {
+    width: 120px;
+}
+&lt;/style&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Case&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_thread_y, num_thread_x&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;num_vthread_y, num_vthread_x&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8, 8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;132.5&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;2&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8, 8&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;103.1&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;3&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4, 32&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 1&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;95.9&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;8, 16&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1, 2&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;90.9&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;p&gt;Case 2 is faster than case 1, because in case 2 &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x=8&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_x=4&lt;/code&gt; together ensure that consecutive threads access consecutive memory addresses,
+thus avoiding bank conflicts, as illustrated below (each color represents one thread’s workload):&lt;/p&gt;
+
+&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/vthread_and_strided_pattern.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;In theory, cases 3 and 4 should be equally fast, since they have the same workload per thread and both enjoy efficient shared memory access. Somehow, case 4 is just a little faster.&lt;/p&gt;
+
+&lt;p&gt;Still remember TensorFlow’s speed? It’s 251.6us, and now TVM is 2.8x faster. Along the way (387.4us -&amp;gt; 132.5us -&amp;gt; 95.9us -&amp;gt; 90.9us), blocking helps the most; tuning thread numbers saves another 37us;
+and vthread saves an additional 5us.&lt;/p&gt;
+
+&lt;p&gt;In fact, TVM can be much faster than TensorFlow with a large kernel size or channel_multiplier (because of more filter reuse):&lt;/p&gt;
+
+&lt;table&gt;
+  &lt;thead&gt;
+    &lt;tr&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Input&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;Filter&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;stride&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;tf-1.2 SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM SAME pad (us)&lt;/th&gt;
+      &lt;th style=&quot;text-align: center&quot;&gt;TVM speedup&lt;/th&gt;
+    &lt;/tr&gt;
+  &lt;/thead&gt;
+  &lt;tbody&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;251.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;90.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;2.8x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 1, 5, 5]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;597.6&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;128.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4.6x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 2, 3, 3]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;659.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;143.7&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;4.6x&lt;/td&gt;
+    &lt;/tr&gt;
+    &lt;tr&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 256, 96, 96]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[256, 2, 5, 5]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;[1, 1]&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;1203.9&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;170.5&lt;/td&gt;
+      &lt;td style=&quot;text-align: center&quot;&gt;7.1x&lt;/td&gt;
+    &lt;/tr&gt;
+  &lt;/tbody&gt;
+&lt;/table&gt;
+
+&lt;h2 id=&quot;operator-fusion&quot;&gt;Operator Fusion&lt;/h2&gt;
+
+&lt;p&gt;One typical optimization we can do in deep learning is operator fusion, which computes multiple operators together in a single kernel without saving intermediate results back to global memory.
+TVM supports this out of the box.&lt;/p&gt;
+
+&lt;p&gt;Consider a common pattern in neural networks: &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt;. We can fuse the three operators into one by slightly modifying the original schedule:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;depthwise_c [...]
+&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;scale_shift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/s [...]
+&lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# is no longer DepthwiseConv2d
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# this line fuses ScaleShift, explicitly
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_scope&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;local&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&qu [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# schedule for Output the same way we schedule for DepthwiseConv2d as discussed above
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_at&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt; [...]
+&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;It generates IR like this:&lt;/p&gt;
+
+&lt;div class=&quot;language-c++ highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* Input = [1, 1, 32, 32], Filter = [1, 1, 3, 3], stride = [1, 1], padding = 'SAME' */&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [DepthwiseConv2d] storage_scope = &quot;local&quot;&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;allocate&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/s [...]
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.y, Range(min=0, extent=8), threadIdx.y)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;c1&quot;&gt;// attr [iter_var(threadIdx.x, Range(min=0, extent=8), threadIdx.x)] thread_extent = 8&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;produce&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+      &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &l [...]
+        &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+          &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt [...]
+          &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+      &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inner&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
+      &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[((((((((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blockIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt [...]
+    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+  &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
... 361 lines suppressed ...