You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2020/07/14 16:04:47 UTC

[incubator-tvm-site] branch asf-site updated: Build at Tue Jul 14 09:04:36 PDT 2020

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 33531cc  Build at Tue Jul 14 09:04:36 PDT 2020
33531cc is described below

commit 33531cc08172d9e39618ec08d2d52508d6fbb6e8
Author: tqchen <tq...@octoml.ai>
AuthorDate: Tue Jul 14 09:04:36 2020 -0700

    Build at Tue Jul 14 09:04:36 PDT 2020
---
 ...s-with-TVM-A-Depthwise-Convolution-Example.html |  128 +-
 ...s-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html |   42 +-
 2017/11/08/android-rpc-introduction.html           |   84 +-
 2018/01/16/opt-mali-gpu.html                       |  124 +-
 2018/03/23/nmt-transformer-optimize.html           |   24 +-
 2018/08/10/DLPack-Bridge.html                      |   48 +-
 2018/10/03/auto-opt-all.html                       |    6 +-
 2018/12/18/lowprecision-conv.html                  |   20 +-
 2019/01/19/Golang.html                             |   16 +-
 2019/04/29/opt-cuda-quantized.html                 |   42 +-
 2019/05/30/pytorch-frontend.html                   |   14 +-
 2020/05/20/bring-your-own-datatypes.html           |   94 +-
 2020/06/04/tinyml-how-tvm-is-taming-tiny.html      |   66 +-
 2020/07/14/bert-pytorch-tvm.html                   |  719 ++++
 atom.xml                                           | 1361 ++++---
 blog.html                                          |   10 +
 download.html                                      |    2 +-
 images/bert-pytorch/bert-tvm_49_0.svg              |  691 ++++
 images/bert-pytorch/bert-tvm_54_0.svg              |  691 ++++
 images/bert-pytorch/bert-tvm_65_2.svg              |  667 ++++
 images/bert-pytorch/bert-tvm_68_0.svg              |  667 ++++
 images/bert-pytorch/bert-tvm_70_0.svg              |  667 ++++
 images/bert-pytorch/bert-tvm_72_0.svg              |  559 +++
 images/bert-pytorch/bert-tvm_74_0.svg              |  547 +++
 images/bert-pytorch/bert_layer.svg                 |  234 ++
 images/bert-pytorch/bert_model.svg                 |  325 ++
 images/bert-pytorch/pytorch-tvm-training_20_0.svg  | 1237 ++++++
 images/bert-pytorch/pytorch-tvm-training_25_0.svg  | 1537 ++++++++
 images/bert-pytorch/pytorch-tvm-training_31_0.svg  | 4015 ++++++++++++++++++++
 images/bert-pytorch/pytorch-tvm-training_34_0.svg  | 4015 ++++++++++++++++++++
 images/bert-pytorch/pytorch-tvm-training_40_0.svg  | 1651 ++++++++
 rss.xml                                            | 1363 ++++---
 sitemap.txt                                        |    1 +
 tvm                                                |    1 -
 34 files changed, 20345 insertions(+), 1323 deletions(-)

diff --git a/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html b/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html
index 3cc820b..0848819 100644
--- a/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html
+++ b/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html
@@ -186,18 +186,18 @@ It’s an effective method to reduce the computation complexity of deep neural n
 <p>In TVM, depthwise convolution can be declared as:</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1"># padding stage
-</span><span class="n">PaddedInput</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span>
+</span><span class="n">PaddedInput</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span>
     <span class="p">(</span><span class="n">batch</span><span class="p">,</span> <span class="n">in_channel</span><span class="p">,</span> <span class="n">height_after_pad</span><span class="p">,</span> <span class="n">width_after_pad</span><span class="p">),</span>
-    <span class="k">lambda</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">tvm</span><span class="p">.</span><span class="n">select</span><span class="p">(</span>
-        <span class="n">tvm</span><span class="p">.</span><span class="nb">all</span><span class="p">(</span><span class="n">i</span> <span class="o">&gt;=</span> <span class="n">pad_top</span><span class="p">,</span> <span class="n">i</span> <span class="o">-</span> <span class="n">pad_top</span> <span class="o">&lt;</span> <span class="n">in_height</span><span class="p">,</span> <span class="n">j</span> <span class="o">&gt;=</span> <span class="n">pad_left</span><span class="p">,</span [...]
-        <span class="n">Input</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span> <span class="o">-</span> <span class="n">pad_top</span><span class="p">,</span> <span class="n">j</span> <span class="o">-</span> <span class="n">pad_left</span><span class="p">],</span> <span class="n">tvm</span><span class="p">.</span><span class="n">const</span><span class="p">(</span><span class="mf">0.0 [...]
+    <span class="k">lambda</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">tvm</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
+        <span class="n">tvm</span><span class="o">.</span><span class="nb">all</span><span class="p">(</span><span class="n">i</span> <span class="o">&gt;=</span> <span class="n">pad_top</span><span class="p">,</span> <span class="n">i</span> <span class="o">-</span> <span class="n">pad_top</span> <span class="o">&lt;</span> <span class="n">in_height</span><span class="p">,</span> <span class="n">j</span> <span class="o">&gt;=</span> <span class="n">pad_left</span><span class="p">,</span [...]
+        <span class="n">Input</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span> <span class="o">-</span> <span class="n">pad_top</span><span class="p">,</span> <span class="n">j</span> <span class="o">-</span> <span class="n">pad_left</span><span class="p">],</span> <span class="n">tvm</span><span class="o">.</span><span class="n">const</span><span class="p">(</span><span class="mf">0.0 [...]
     <span class="n">name</span><span class="o">=</span><span class="s">"PaddedInput"</span><span class="p">)</span>
 <span class="c1"># depthconv stage
-</span><span class="n">di</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">filter_height</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'di'</span><span class="p">)</span>
-<span class="n">dj</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">filter_width</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'dj'</span><span class="p">)</span>
-<span class="n">Output</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span>
+</span><span class="n">di</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">filter_height</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'di'</span><span class="p">)</span>
+<span class="n">dj</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">filter_width</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'dj'</span><span class="p">)</span>
+<span class="n">Output</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span>
     <span class="p">(</span><span class="n">batch</span><span class="p">,</span> <span class="n">out_channel</span><span class="p">,</span> <span class="n">out_height</span><span class="p">,</span> <span class="n">out_width</span><span class="p">),</span>
-    <span class="k">lambda</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">tvm</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span>
+    <span class="k">lambda</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">tvm</span><span class="o">.</span><span class="nb">sum</span><span class="p">(</span>
         <span class="n">PaddedInput</span><span class="p">[</span><span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="o">/</span><span class="n">channel_multiplier</span><span class="p">,</span> <span class="n">i</span><span class="o">*</span><span class="n">stride_h</span> <span class="o">+</span> <span class="n">di</span><span class="p">,</span> <span class="n">j</span><span class="o">*</span><span class="n">stride_w</span> <span class="o">+</span> <sp [...]
         <span class="n">axis</span><span class="o">=</span><span class="p">[</span><span class="n">di</span><span class="p">,</span> <span class="n">dj</span><span class="p">]),</span>
     <span class="n">name</span><span class="o">=</span><span class="s">'DepthwiseConv2d'</span><span class="p">)</span>
@@ -250,21 +250,21 @@ To avoid bank conflicts, it’s better that successive threads access successive
 <h3 id="compute-paddedinput-inline-to-save-memory-allocation">Compute PaddedInput Inline to Save Memory Allocation</h3>
 <p>As we see from part 1, padding is declared explicitly as a separate stage. We compute it inline to avoid redundant memory allocation:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="n">op</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">PaddedInput</span><span class="p">].</span><span class="n">compute_inline</span><span class="p">()</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">Output</span><span class="o">.</span><span class="n">op</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">PaddedInput</span><span class="p">]</span><span class="o">.</span><span class="n">compute_inline</span><span class="p">()</span>
 </code></pre></div></div>
 
 <h3 id="divide-one-large-channel-into-smaller-blocks">Divide One Large Channel into Smaller Blocks</h3>
 <p>One straightforward schedule for depthwise convolution is that one cuda block takes care of one input channel and corresponding filters, loading them into shared memory and then computing:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">IS</span> <span class="o">=</span> <span class="n">s</span><span class="p">.</span><span class="n">cache_read</span><span class="p">(</span><span class="n">PaddedInput</span><span class="p">,</span> <span class="s">"shared"</span><span class="p">,</span> <span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">])</span>
-<span class="n">FS</span> <span class="o">=</span> <span class="n">s</span><span class="p">.</span><span class="n">cache_read</span><span class="p">(</span><span class="n">Filter</span><span class="p">,</span> <span class="s">"shared"</span><span class="p">,</span> <span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">])</span>
-<span class="n">block_y</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.y"</span><span class="p">)</span>
-<span class="n">block_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">IS</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">cache_read</span><span class="p">(</span><span class="n">PaddedInput</span><span class="p">,</span> <span class="s">"shared"</span><span class="p">,</span> <span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">])</span>
+<span class="n">FS</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">cache_read</span><span class="p">(</span><span class="n">Filter</span><span class="p">,</span> <span class="s">"shared"</span><span class="p">,</span> <span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">])</span>
+<span class="n">block_y</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.y"</span><span class="p">)</span>
+<span class="n">block_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">)</span>
 <span class="c1"># bind the dimension of batch (N in NCHW) with block_y
-</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">block_y</span><span class="p">)</span>
+</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">Output</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">block_y</span><span class="p">)</span>
 <span class="c1"># bind the dimension of channel (C in NCHW) with block_x
-</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">block_x</span><span class="p">)</span>
+</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">Output</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">block_x</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>We test the average time cost of 1000 runs on GTX 1080, and compare with <a href="https://www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution#depthwise_conv2d">depthwise_conv2d in tensorflow</a>.
@@ -321,14 +321,14 @@ and one cuda block takes care of one 32 x 32 block:</p>
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">blocking_h</span> <span class="o">=</span> <span class="mi">32</span>
 <span class="n">blocking_w</span> <span class="o">=</span> <span class="mi">32</span>
 <span class="c1"># split the dimension of height (H in NCHW)
-</span><span class="n">bx1</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="n">factor</sp [...]
+</span><span class="n">bx1</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">Output</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> <sp [...]
 <span class="c1"># split the dimension of width (W in NCHW)
-</span><span class="n">bx2</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">3</span><span class="p">],</span> <span class="n">factor</sp [...]
+</span><span class="n">bx2</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">Output</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">3</span><span class="p">],</span> <sp [...]
 <span class="c1"># assign one 32 x 32 block to one cuda block
-</span><span class="n">by</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">fuse</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">Output</span><span class="p">.</span><span class="n">op</span [...]
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">block_y</span><span class="p">)</span>
-<span class="n">bx</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">fuse</span><span class="p">(</span><span class="n">bx1</span><span class="p">,</span> <span class="n">bx2</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">bx</span><span class="p">,</span> <span class="n">block_x</span><span class="p">)</span>
+</span><span class="n">by</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">fuse</span><span class="p">(</span><span class="n">Output</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">Output</span><span class="o">.</span>< [...]
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">block_y</span><span class="p">)</span>
+<span class="n">bx</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">fuse</span><span class="p">(</span><span class="n">bx1</span><span class="p">,</span> <span class="n">bx2</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">bx</span><span class="p">,</span> <span class="n">block_x</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>Here is the new result:</p>
@@ -366,16 +366,16 @@ and one cuda block takes care of one 32 x 32 block:</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">num_thread_y</span> <span class="o">=</span> <span class="mi">8</span>
 <span class="n">num_thread_x</span> <span class="o">=</span> <span class="mi">8</span>
-<span class="n">thread_y</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_y</span><span class="p">),</span> <span class="s">"threadIdx.y"</span><span class="p">)</span>
-<span class="n">thread_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="p">),</span> <span class="s">"threadIdx.x"</span><span class="p">)</span>
-<span class="n">ty</span><span class="p">,</span> <span class="n">yi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">h_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_y</span><span class="p">)</span>
-<span class="n">tx</span><span class="p">,</span> <span class="n">xi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">w_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_x</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">reorder</span><span class="p">(</span><span class="n">ty</span><span class="p">,</span> <span class="n">tx</span><span class="p">,</span> <span class="n">yi</span><span class="p">,</span> <span class="n">xi</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">ty</span><span class="p">,</span> <span class="n">thread_y</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">tx</span><span class="p">,</span> <span class="n">thread_x</span><span class="p">)</span>
+<span class="n">thread_y</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_y</span><span class="p">),</span> <span class="s">"threadIdx.y"</span><span class="p">)</span>
+<span class="n">thread_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="p">),</span> <span class="s">"threadIdx.x"</span><span class="p">)</span>
+<span class="n">ty</span><span class="p">,</span> <span class="n">yi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">h_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_y</span><span class="p">)</span>
+<span class="n">tx</span><span class="p">,</span> <span class="n">xi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">w_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_x</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">reorder</span><span class="p">(</span><span class="n">ty</span><span class="p">,</span> <span class="n">tx</span><span class="p">,</span> <span class="n">yi</span><span class="p">,</span> <span class="n">xi</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">ty</span><span class="p">,</span> <span class="n">thread_y</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">tx</span><span class="p">,</span> <span class="n">thread_x</span><span class="p">)</span>
 </code></pre></div></div>
 
-<p>There are two parameters in the schedule: <code class="language-plaintext highlighter-rouge">num_thread_y</code> and <code class="language-plaintext highlighter-rouge">num_thread_x</code>. How to determine the optimal combination of them? 
+<p>There are two parameters in the schedule: <code class="highlighter-rouge">num_thread_y</code> and <code class="highlighter-rouge">num_thread_x</code>. How to determine the optimal combination of them? 
 Well, let’s first do some experiments. Below is the result with Filter = [256, 1, 3, 3] and stride = [1, 1]:</p>
 
 <table>
@@ -431,7 +431,7 @@ It has better data reuse than case 1’s 4x1 tile.</p>
     <p>Case 3 is slower than case 2. It’s because in case 3, the workload per thread is too large and leads to much cost of local memory read.</p>
   </li>
   <li>
-    <p>Case 4 is slower than case 3. It’s because <code class="language-plaintext highlighter-rouge">num_thread_x = 32</code> ensures no bank conflicts, while <code class="language-plaintext highlighter-rouge">num_thread_y = 32</code> doesn’t.</p>
+    <p>Case 4 is slower than case 3. It’s because <code class="highlighter-rouge">num_thread_x = 32</code> ensures no bank conflicts, while <code class="highlighter-rouge">num_thread_y = 32</code> doesn’t.</p>
   </li>
 </ul>
 
@@ -439,14 +439,14 @@ It has better data reuse than case 1’s 4x1 tile.</p>
 
 <ul>
   <li>Large tile is good for data reuse, but not good for local memory read.</li>
-  <li>The influence of <code class="language-plaintext highlighter-rouge">num_thread_y</code> and <code class="language-plaintext highlighter-rouge">num_thread_x</code> on bank conflicts is asymmetric.</li>
-  <li>To find the optimal combination of <code class="language-plaintext highlighter-rouge">num_thread_y</code> and <code class="language-plaintext highlighter-rouge">num_thread_x</code> is to achieve a balance of efficient shared memory access (avoid bank conflicts), data reuse, and local memory read.</li>
+  <li>The influence of <code class="highlighter-rouge">num_thread_y</code> and <code class="highlighter-rouge">num_thread_x</code> on bank conflicts is asymmetric.</li>
+  <li>To find the optimal combination of <code class="highlighter-rouge">num_thread_y</code> and <code class="highlighter-rouge">num_thread_x</code> is to achieve a balance of efficient shared memory access (avoid bank conflicts), data reuse, and local memory read.</li>
 </ul>
 
 <p>Pretty tricky. So, what exactly should we do to find the optimal combination? The answer is brute force search. 
-We can pass <code class="language-plaintext highlighter-rouge">num_thread_y</code> and <code class="language-plaintext highlighter-rouge">num_thread_x</code> as arguments to the schedule function, and try all possible combinations to find the optimal one. This can be done easily in TVM:</p>
+We can pass <code class="highlighter-rouge">num_thread_y</code> and <code class="highlighter-rouge">num_thread_x</code> as arguments to the schedule function, and try all possible combinations to find the optimal one. This can be done easily in TVM:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">def</span> <span class="nf">schedule_depthwise_conv2d</span><span class="p">(...,</span> <span class="n">num_thread_y</span><span class="o">=</span><span class="mi">8</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="o">=</span><span class="mi">8</span><span class="p">):</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">def</span> <span class="nf">schedule_depthwise_conv2d</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">num_thread_y</span><span class="o">=</span><span class="mi">8</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="o">=</span><span class="mi">8</span><span class="p">):</span>
     <span class="n">num_thread_y</span> <span class="o">=</span> <span class="n">num_thread_y</span>
     <span class="n">num_thread_x</span> <span class="o">=</span> <span class="n">num_thread_x</span>
     <span class="n">do_schedule_as_usual</span>
@@ -454,8 +454,8 @@ We can pass <code class="language-plaintext highlighter-rouge">num_thread_y</cod
 
 <span class="n">min_time_cost</span> <span class="o">=</span> <span class="n">inf</span>
 <span class="k">for</span> <span class="n">num_thread_y</span><span class="p">,</span> <span class="n">num_thread_x</span> <span class="ow">in</span> <span class="n">all_possible_combinations</span><span class="p">:</span>
-    <span class="n">schedule</span> <span class="o">=</span> <span class="n">schedule_depthwise_conv2d</span><span class="p">(...,</span> <span class="n">num_thread_y</span><span class="o">=</span><span class="n">num_thread_y</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="o">=</span><span class="n">num_thread_x</span><span class="p">)</span>
-    <span class="n">time_cost</span> <span class="o">=</span> <span class="n">test_depthwise_conv2d</span><span class="p">(...,</span> <span class="n">schedule</span><span class="p">)</span>
+    <span class="n">schedule</span> <span class="o">=</span> <span class="n">schedule_depthwise_conv2d</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">num_thread_y</span><span class="o">=</span><span class="n">num_thread_y</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="o">=</span><span class="n">num_thread_x</span><span class="p">)</span>
+    <span class="n">time_cost</span> <span class="o">=</span> <span class="n">test_depthwise_conv2d</span><span class="p">(</span><span class="o">...</span><span class="p">,</span> <span class="n">schedule</span><span class="p">)</span>
     <span class="k">if</span> <span class="n">time_cost</span> <span class="o">&lt;</span> <span class="n">min_time_cost</span><span class="p">:</span>
         <span class="n">min_time_cost</span> <span class="o">=</span> <span class="n">time_cost</span>
         <span class="n">optimal_combination</span> <span class="o">=</span> <span class="p">[</span><span class="n">num_thread_y</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="p">]</span>
@@ -470,22 +470,22 @@ We can pass <code class="language-plaintext highlighter-rouge">num_thread_y</cod
 <span class="n">num_vthread_x</span> <span class="o">=</span> <span class="mi">2</span>
 <span class="n">num_thread_y</span> <span class="o">=</span> <span class="mi">8</span>
 <span class="n">num_thread_x</span> <span class="o">=</span> <span class="mi">8</span>
-<span class="n">thread_vy</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_vthread_y</span><span class="p">),</span> <span class="s">"vthread"</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s">"vy"</span><span class="p">)</span>
-<span class="n">thread_vx</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_vthread_x</span><span class="p">),</span> <span class="s">"vthread"</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s">"vx"</span><span class="p">)</span>
-<span class="n">thread_y</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_y</span><span class="p">),</span> <span class="s">"threadIdx.y"</span><span class="p">)</span>
-<span class="n">thread_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="p">),</span> <span class="s">"threadIdx.x"</span><span class="p">)</span>
+<span class="n">thread_vy</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_vthread_y</span><span class="p">),</span> <span class="s">"vthread"</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s">"vy"</span><span class="p">)</span>
+<span class="n">thread_vx</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_vthread_x</span><span class="p">),</span> <span class="s">"vthread"</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s">"vx"</span><span class="p">)</span>
+<span class="n">thread_y</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_y</span><span class="p">),</span> <span class="s">"threadIdx.y"</span><span class="p">)</span>
+<span class="n">thread_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_thread_x</span><span class="p">),</span> <span class="s">"threadIdx.x"</span><span class="p">)</span>
 <span class="c1"># split the dimension of height (H in NCHW) twice
-</span><span class="n">tvy</span><span class="p">,</span> <span class="n">vyi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">h_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_vthread_y</span><span class="p">)</span>
-<span class="n">ty</span><span class="p">,</span> <span class="n">yi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">vyi</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_y</span><span class="p">)</span>
+</span><span class="n">tvy</span><span class="p">,</span> <span class="n">vyi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">h_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_vthread_y</span><span class="p">)</span>
+<span class="n">ty</span><span class="p">,</span> <span class="n">yi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">vyi</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_y</span><span class="p">)</span>
 <span class="c1"># split the dimension of width (W in NCHW) twice
-</span><span class="n">tvx</span><span class="p">,</span> <span class="n">vxi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">w_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_vthread_x</span><span class="p">)</span>
-<span class="n">tx</span><span class="p">,</span> <span class="n">xi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">vxi</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_x</span><span class="p">)</span>
+</span><span class="n">tvx</span><span class="p">,</span> <span class="n">vxi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">w_dim</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_vthread_x</span><span class="p">)</span>
+<span class="n">tx</span><span class="p">,</span> <span class="n">xi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">vxi</span><span class="p">,</span> <span class="n">nparts</span><span class="o">=</span><span class="n">num_thread_x</span><span class="p">)</span>
 <span class="c1"># bind thread and vthread respectively
-</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">tvy</span><span class="p">,</span> <span class="n">thread_vy</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">tvx</span><span class="p">,</span> <span class="n">thread_vx</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">ty</span><span class="p">,</span> <span class="n">thread_y</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">tx</span><span class="p">,</span> <span class="n">thread_x</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">].</span><span class="n">reorder</span><span class="p">(</span><span class="n">tvy</span><span class="p">,</span> <span class="n">tvx</span><span class="p">,</span> <span class="n">ty</span><span class="p">,</span> <span class="n">tx</span><span class="p">,</span> <span class="n">yi</span><span class="p">,</span> <span class="n">xi</span><span class="p">)</span>
+</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">tvy</span><span class="p">,</span> <span class="n">thread_vy</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">tvx</span><span class="p">,</span> <span class="n">thread_vx</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">ty</span><span class="p">,</span> <span class="n">thread_y</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">tx</span><span class="p">,</span> <span class="n">thread_x</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">]</span><span class="o">.</span><span class="n">reorder</span><span class="p">(</span><span class="n">tvy</span><span class="p">,</span> <span class="n">tvx</span><span class="p">,</span> <span class="n">ty</span><span class="p">,</span> <span class="n">tx</span><span class="p">,</span> <span class="n">yi</span><span class="p">,</span> <span class="n">xi</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>Let’s print the IR to see what vthread does:</p>
@@ -536,7 +536,7 @@ We can pass <code class="language-plaintext highlighter-rouge">num_thread_y</cod
 <span class="p">}</span>
 </code></pre></div></div>
 
-<p>As we can see, when <code class="language-plaintext highlighter-rouge">num_vthread_y = 2</code> and <code class="language-plaintext highlighter-rouge">num_vthread_x = 2</code>, the 32 x 32 channel is divided into four sub-channels of 16 x 16.
+<p>As we can see, when <code class="highlighter-rouge">num_vthread_y = 2</code> and <code class="highlighter-rouge">num_vthread_x = 2</code>, the 32 x 32 channel is divided into four sub-channels of 16 x 16.
 Each thread computes four output elements at a time, one element in one sub-channel.</p>
 
 <p>Below is the result with Filter = [256, 1, 3, 3], stride = [1, 1], blocking_h = 32, blocking_w = 32:</p>
@@ -592,7 +592,7 @@ table th:nth-of-type(2) {
   </tbody>
 </table>
 
-<p>Case 2 is faster than case 1. It’s because in case 2 <code class="language-plaintext highlighter-rouge">num_thread_x=8</code> and <code class="language-plaintext highlighter-rouge">num_vthread_x=4</code> together ensures that consecutive threads access consecutive memory addresses,
+<p>Case 2 is faster than case 1. It’s because in case 2 <code class="highlighter-rouge">num_thread_x=8</code> and <code class="highlighter-rouge">num_vthread_x=4</code> together ensures that consecutive threads access consecutive memory addresses,
 thus avoiding bank conflicts, as illustrated below (each color represents one thread’s workload):</p>
 
 <p style="text-align: center"><img src="/images/depthconv_tutorial/vthread_and_strided_pattern.png" alt="image" width="90%" /></p>
@@ -656,17 +656,17 @@ vthread saves additional 5us.</p>
 <p>One typical optimization we can do in deep learning is operator fusion, that computes multiple operators together in a single kernel without saving intermediate results back to global memory.
 TVM supports that out of the box.</p>
 
-<p>Consider a common pattern in neural networks: <code class="language-plaintext highlighter-rouge">depthwise_conv2d</code> + <code class="language-plaintext highlighter-rouge">scale_shift</code> + <code class="language-plaintext highlighter-rouge">relu</code>. We can fuse the three operators into one, by slightly modifying the original schedule:</p>
+<p>Consider a common pattern in neural networks: <code class="highlighter-rouge">depthwise_conv2d</code> + <code class="highlighter-rouge">scale_shift</code> + <code class="highlighter-rouge">relu</code>. We can fuse the three operators into one, by slightly modifying the original schedule:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">DepthwiseConv2d</span> <span class="o">=</span> <span class="n">topi</span><span class="p">.</span><span class="n">nn</span><span class="p">.</span><span class="n">depthwise_conv2d</span><span class="p">(</span><span class="n">Input</span><span class="p">,</span> <span class="n">Filter</span><span class="p">,</span> <span class="n">stride</span><span class="p">,</span> <span [...]
-<span class="n">ScaleShift</span> <span class="o">=</span> <span class="n">topi</span><span class="p">.</span><span class="n">nn</span><span class="p">.</span><span class="n">scale_shift</span><span class="p">(</span><span class="n">DepthwiseConv2d</span><span class="p">,</span> <span class="n">Scale</span><span class="p">,</span> <span class="n">Shift</span><span class="p">)</span>
-<span class="n">Relu</span> <span class="o">=</span> <span class="n">topi</span><span class="p">.</span><span class="n">nn</span><span class="p">.</span><span class="n">relu</span><span class="p">(</span><span class="n">ScaleShift</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">DepthwiseConv2d</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">depthwise_conv2d</span><span class="p">(</span><span class="n">Input</span><span class="p">,</span> <span class="n">Filter</span><span class="p">,</span> <span class="n">stride</span><span class="p">,</span> <span [...]
+<span class="n">ScaleShift</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">scale_shift</span><span class="p">(</span><span class="n">DepthwiseConv2d</span><span class="p">,</span> <span class="n">Scale</span><span class="p">,</span> <span class="n">Shift</span><span class="p">)</span>
+<span class="n">Relu</span> <span class="o">=</span> <span class="n">topi</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">ScaleShift</span><span class="p">)</span>
 
 <span class="n">Output</span> <span class="o">=</span> <span class="n">Relu</span> <span class="c1"># is no longer DepthwiseConv2d
-</span><span class="n">s</span><span class="p">[</span><span class="n">ScaleShift</span><span class="p">].</span><span class="n">compute_inline</span><span class="p">()</span> <span class="c1"># this line fuses ScaleShift, explicitly
-</span><span class="n">s</span><span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">].</span><span class="n">set_scope</span><span class="p">(</span><span class="s">"local"</span><span class="p">)</span> <span class="c1"># this line fuses DepthwiseConv2d, implicitly
+</span><span class="n">s</span><span class="p">[</span><span class="n">ScaleShift</span><span class="p">]</span><span class="o">.</span><span class="n">compute_inline</span><span class="p">()</span> <span class="c1"># this line fuses ScaleShift, explicitly
+</span><span class="n">s</span><span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">]</span><span class="o">.</span><span class="n">set_scope</span><span class="p">(</span><span class="s">"local"</span><span class="p">)</span> <span class="c1"># this line fuses DepthwiseConv2d, implicitly
 </span><span class="n">schedule</span><span class="p">(</span><span class="n">Output</span><span class="p">)</span> <span class="c1"># schedule for Output the same way we schedule for DepthwiseConv2d as discussed above
-</span><span class="n">s</span><span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">].</span><span class="n">compute_at</span><span class="p">(</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">],</span> <span class="n">tx</span><span class="p">)</span> <span class="c1"># tx is the inner most axis, bound to threadIdx.x
+</span><span class="n">s</span><span class="p">[</span><span class="n">DepthwiseConv2d</span><span class="p">]</span><span class="o">.</span><span class="n">compute_at</span><span class="p">(</span><span class="n">s</span><span class="p">[</span><span class="n">Output</span><span class="p">],</span> <span class="n">tx</span><span class="p">)</span> <span class="c1"># tx is the inner most axis, bound to threadIdx.x
 </span></code></pre></div></div>
 
 <p>It generates IR like this:</p>
@@ -699,14 +699,14 @@ TVM supports that out of the box.</p>
 <span class="p">}</span>
 </code></pre></div></div>
 
-<p>As we can see, each thread computes <code class="language-plaintext highlighter-rouge">scale_shift</code> and <code class="language-plaintext highlighter-rouge">relu</code> before writing the result of <code class="language-plaintext highlighter-rouge">depthwise_conv2d</code> to global memory. The fused operator is as fast as single <code class="language-plaintext highlighter-rouge">depthwise_conv2d</code>.
+<p>As we can see, each thread computes <code class="highlighter-rouge">scale_shift</code> and <code class="highlighter-rouge">relu</code> before writing the result of <code class="highlighter-rouge">depthwise_conv2d</code> to global memory. The fused operator is as fast as single <code class="highlighter-rouge">depthwise_conv2d</code>.
 Below is the result with Input = [1, 256, 96, 96], Filter = [256, 1, 3, 3], stride = [1, 1], padding = ‘SAME’:</p>
 
 <ul>
-  <li>tf-1.2 <code class="language-plaintext highlighter-rouge">depthwise_conv2d</code>: 251.6 us</li>
-  <li>tf-1.2 <code class="language-plaintext highlighter-rouge">depthwise_conv2d</code> + <code class="language-plaintext highlighter-rouge">scale_shift</code> + <code class="language-plaintext highlighter-rouge">relu</code> (separate): 419.9 us</li>
-  <li>TVM <code class="language-plaintext highlighter-rouge">depthwise_conv2d</code>: 90.9 us</li>
-  <li>TVM <code class="language-plaintext highlighter-rouge">depthwise_conv2d + scale_shift + relu</code> (fused): 91.5 us</li>
+  <li>tf-1.2 <code class="highlighter-rouge">depthwise_conv2d</code>: 251.6 us</li>
+  <li>tf-1.2 <code class="highlighter-rouge">depthwise_conv2d</code> + <code class="highlighter-rouge">scale_shift</code> + <code class="highlighter-rouge">relu</code> (separate): 419.9 us</li>
+  <li>TVM <code class="highlighter-rouge">depthwise_conv2d</code>: 90.9 us</li>
+  <li>TVM <code class="highlighter-rouge">depthwise_conv2d + scale_shift + relu</code> (fused): 91.5 us</li>
 </ul>
 
 <p>The advantage of operator fusion is obvious.</p>
diff --git a/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html b/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
index 34dd1c5..07f0cb6 100644
--- a/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
+++ b/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
@@ -187,7 +187,7 @@
 
 <ul>
   <li>Loads Resnet 50 model from <a href="https://mxnet.incubator.apache.org/versions/master/api/python/gluon/model_zoo.html">the Gluon model zoo</a></li>
-  <li>Converts Gluon Resnet 50 model to NNVM graph format, using <code class="language-plaintext highlighter-rouge">nnvm.frontend.from_mxnet (...)</code></li>
+  <li>Converts Gluon Resnet 50 model to NNVM graph format, using <code class="highlighter-rouge">nnvm.frontend.from_mxnet (...)</code></li>
   <li>Compiles and executes the graph with ROCm backend</li>
 </ul>
 
@@ -204,7 +204,7 @@ TVM prediction top-1: 282 tiger cat</code></pre></figure>
 
 <p>The script <a href="https://github.com/ROCmSoftwarePlatform/nnvm-rocm/blob/master/advanced_superres_onnx.py">advanced_superres_onnx.py</a> gives an example of loading a model trained with PyTorch. The model is stored in the <a href="https://onnx.ai/">ONNX</a> format. In this example, our network takes an low resolution image as input, and outputs a 4x high resolution image. We refer the details of a problem setup and the network architecture to <a href="https://arxiv.org/abs/1609.0480 [...]
 
-<p>In order to use models in the ONNX format with NNVM, we first use <a href="https://github.com/onnx/onnx">the ONNX library</a> to load the ONNX model into the Protocol buffer object. We can then use <code class="language-plaintext highlighter-rouge">nnvm.frontend.from_onnx(...)</code> to obtain an equivalent NNVM graph. With a NNVM graph in hand, we can follow the generic workflow of compilation and graph execution outlined above.</p>
+<p>In order to use models in the ONNX format with NNVM, we first use <a href="https://github.com/onnx/onnx">the ONNX library</a> to load the ONNX model into the Protocol buffer object. We can then use <code class="highlighter-rouge">nnvm.frontend.from_onnx(...)</code> to obtain an equivalent NNVM graph. With a NNVM graph in hand, we can follow the generic workflow of compilation and graph execution outlined above.</p>
 
 <p style="text-align: center"><img src="/images/rocm/butterfly.png" alt="image" /></p>
 
@@ -229,46 +229,46 @@ We are starting to look at performance optimization and we expect more improveme
 <span class="kn">import</span> <span class="nn">tvm</span>
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="n">np</span>
 
-<span class="n">n</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">var</span><span class="p">(</span><span class="s">"n"</span><span class="p">)</span>
-<span class="n">A</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,),</span> <span class="n">name</span><span class="o">=</span><span class="s">'A'</span><span class="p">)</span>
-<span class="n">B</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,),</span> <span class="n">name</span><span class="o">=</span><span class="s">'B'</span><span class="p">)</span>
-<span class="n">C</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span><span class="n">A</span><span class="p">.</span><span class="n">shape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">A</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+</span> <span class="n">B</span><span class= [...]
-<span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">C</span><span class="p">.</span><span class="n">op</span><span class="p">)</span>
-<span class="n">bx</span><span class="p">,</span> <span class="n">tx</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">C</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">factor</span><span class="o [...]
-<span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">bx</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">))</span>
-<span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">tx</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.x"</span><span class="p">))</span></code></pre></figure>
+<span class="n">n</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s">"n"</span><span class="p">)</span>
+<span class="n">A</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,),</span> <span class="n">name</span><span class="o">=</span><span class="s">'A'</span><span class="p">)</span>
+<span class="n">B</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,),</span> <span class="n">name</span><span class="o">=</span><span class="s">'B'</span><span class="p">)</span>
+<span class="n">C</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span><span class="n">A</span><span class="o">.</span><span class="n">shape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">A</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">+</span> <span class="n">B</span><span class= [...]
+<span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">C</span><span class="o">.</span><span class="n">op</span><span class="p">)</span>
+<span class="n">bx</span><span class="p">,</span> <span class="n">tx</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">C</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">fact [...]
+<span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">bx</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">))</span>
+<span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">tx</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.x"</span><span class="p">))</span></code></pre></figure>
 
 <p>Next, to use ROCm backend we build our kernel under “rocm” target. This will cause TVM to use our new code generator. We also need a runtime context for ROCm backend.</p>
 
 <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">target</span> <span class="o">=</span> <span class="s">"rocm"</span>
-<span class="n">fadd_rocm</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">C</span><span class="p">],</span> <span class="n">target</span><span class="p">,</span> <span class="n">target_host</span><span class="o">=</span [...]
-<span class="n">ctx</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">rocm</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span></code></pre></figure>
+<span class="n">fadd_rocm</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">C</span><span class="p">],</span> <span class="n">target</span><span class="p">,</span> <span class="n">target_host</span><span class="o">=</span [...]
+<span class="n">ctx</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">rocm</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span></code></pre></figure>
 
 <p>After building the kernel and setting up a runtime context, we can launch our vector add kernel.</p>
 
 <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">n</span> <span class="o">=</span> <span class="mi">1024</span>
-<span class="n">a</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">n</span><span class="p">).</span><span class="n">astype</span><span  [...]
-<span class="n">b</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">n</span><span class="p">).</span><span class="n">astype</span><span  [...]
-<span class="n">c</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">C</span><span class="p">.</span><span class="n">dtype</span><span class=" [...]
+<span class="n">a</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class= [...]
+<span class="n">b</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class= [...]
+<span class="n">c</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">C</span><span class="o">.</span><span class="n">dtype</span><span class=" [...]
 
 <span class="n">fadd_rocm</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span>
-<span class="n">np</span><span class="p">.</span><span class="n">testing</span><span class="p">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">c</span><span class="p">.</span><span class="n">asnumpy</span><span class="p">(),</span> <span class="n">a</span><span class="p">.</span><span class="n">asnumpy</span><span class="p">()</span> <span class="o">+</span> <span class="n">b</span><span class="p">.</span><span class="n">asnumpy</span><span class="p" [...]
+<span class="n">np</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">(),</span> <span class="n">a</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">()</span> <span class="o">+</span> <span class="n">b</span><span class="o">.</span><span class="n">asnumpy</span><span class="p" [...]
 
 <p>We can view LLVM IR that TVM generates in the following way:</p>
 
-<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">dev_module</span> <span class="o">=</span> <span class="n">fadd_rocm</span><span class="p">.</span><span class="n">imported_modules</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
-<span class="k">print</span><span class="p">(</span><span class="n">dev_module</span><span class="p">.</span><span class="n">get_source</span><span class="p">(</span><span class="s">"llvm"</span><span class="p">))</span></code></pre></figure>
+<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">dev_module</span> <span class="o">=</span> <span class="n">fadd_rocm</span><span class="o">.</span><span class="n">imported_modules</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
+<span class="k">print</span><span class="p">(</span><span class="n">dev_module</span><span class="o">.</span><span class="n">get_source</span><span class="p">(</span><span class="s">"llvm"</span><span class="p">))</span></code></pre></figure>
 
 <p>You should see something like this:</p>
 
 <figure class="highlight"><pre><code class="language-llvm" data-lang="llvm"><span class="c1">; ModuleID = 'myadd__kernel0'</span>
-<span class="k">source_filename</span> <span class="p">=</span> <span class="s">"myadd__kernel0"</span>
+<span class="err">source_filename</span> <span class="p">=</span> <span class="s">"myadd__kernel0"</span>
 <span class="k">target</span> <span class="k">datalayout</span> <span class="p">=</span> <span class="s">"e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"</span>
 <span class="k">target</span> <span class="k">triple</span> <span class="p">=</span> <span class="s">"amdgcn-amd-amdhsa-hcc"</span>
 
 
 <span class="c1">; Function Attrs: nounwind</span>
-<span class="k">define</span> <span class="k">dllexport</span> <span class="k">amdgpu_kernel</span> <span class="kt">void</span> <span class="vg">@myadd__kernel0</span><span class="p">(</span><span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="k">noalias</span> <span class="k">nocapture</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p [...]
+<span class="k">define</span> <span class="k">dllexport</span> <span class="err">amdgpu_kernel</span> <span class="kt">void</span> <span class="vg">@myadd__kernel0</span><span class="p">(</span><span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="k">noalias</span> <span class="k">nocapture</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class= [...]
 <span class="nl">entry:</span>
   <span class="nv">%4</span> <span class="p">=</span> <span class="k">tail</span> <span class="k">call</span> <span class="kt">i32</span> <span class="vg">@llvm.amdgcn.workgroup.id.x</span><span class="p">()</span>
   <span class="nv">%5</span> <span class="p">=</span> <span class="k">tail</span> <span class="k">call</span> <span class="kt">i32</span> <span class="vg">@llvm.amdgcn.workitem.id.x</span><span class="p">()</span>
@@ -312,7 +312,7 @@ We are starting to look at performance optimization and we expect more improveme
 
 <p>We can also view GPU assembly that ROCm backend generates. This is the real code that runs on your GPU.</p>
 
-<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="k">print</span><span class="p">(</span><span class="n">dev_module</span><span class="p">.</span><span class="n">get_source</span><span class="p">(</span><span class="s">"asm"</span><span class="p">))</span></code></pre></figure>
+<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="k">print</span><span class="p">(</span><span class="n">dev_module</span><span class="o">.</span><span class="n">get_source</span><span class="p">(</span><span class="s">"asm"</span><span class="p">))</span></code></pre></figure>
 
 <p>The assembly should look something like this, omitting unnecessary details:</p>
 
diff --git a/2017/11/08/android-rpc-introduction.html b/2017/11/08/android-rpc-introduction.html
index 73c061d..f4b581f 100644
--- a/2017/11/08/android-rpc-introduction.html
+++ b/2017/11/08/android-rpc-introduction.html
@@ -185,7 +185,7 @@ In order to optimize a computation task, one has to edit the code on the develop
 
 <h2 id="run-tvm-app-on-android-phone">Run TVM APP on Android Phone</h2>
 
-<p>You can find Android RPC APP in <a href="https://github.com/dmlc/tvm/tree/master/apps/android_rpc">apps/android_rpc</a>. Please follow the instruction to build for your Android device. Once the APK is built, sign it using <code class="language-plaintext highlighter-rouge">apps/android_rpc/dev_tools</code> and install it on the phone. The APP looks like:</p>
+<p>You can find Android RPC APP in <a href="https://github.com/dmlc/tvm/tree/master/apps/android_rpc">apps/android_rpc</a>. Please follow the instruction to build for your Android device. Once the APK is built, sign it using <code class="highlighter-rouge">apps/android_rpc/dev_tools</code> and install it on the phone. The APP looks like:</p>
 
 <p style="text-align: center"><img src="/images/android_rpc/app.png" alt="image" width="25%" />
 <img src="/images/android_rpc/app_error.png" alt="image" width="25%" /></p>
@@ -200,17 +200,17 @@ In order to optimize a computation task, one has to edit the code on the develop
 <p>Now we can connect to the proxy server from the laptop:</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">rpc</span>
-<span class="n">remote</span> <span class="o">=</span> <span class="n">rpc</span><span class="p">.</span><span class="n">connect</span><span class="p">(</span><span class="s">"0.0.0.0"</span><span class="p">,</span> <span class="mi">9090</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="s">"android"</span><span class="p">)</span>
+<span class="n">remote</span> <span class="o">=</span> <span class="n">rpc</span><span class="o">.</span><span class="n">connect</span><span class="p">(</span><span class="s">"0.0.0.0"</span><span class="p">,</span> <span class="mi">9090</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="s">"android"</span><span class="p">)</span>
 </code></pre></div></div>
 
-<p>This will give us a handler <code class="language-plaintext highlighter-rouge">remote</code> which we can use to communicate with the mobile phone. For instance, the following lines create a 1024x1024 matrix on phone’s GPU:</p>
+<p>This will give us a handler <code class="highlighter-rouge">remote</code> which we can use to communicate with the mobile phone. For instance, the following lines create a 1024x1024 matrix on phone’s GPU:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">A</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span>
-	<span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="mi">1024</span><span class="p">,</span> <span class="mi">1024</span><span class="p">)).</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">),</span>
-	<span class="n">ctx</span> <span class="o">=</span> <span class="n">remote</span><span class="p">.</span><span class="n">cl</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">A</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span>
+	<span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="mi">1024</span><span class="p">,</span> <span class="mi">1024</span><span class="p">))</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">),</span>
+	<span class="n">ctx</span> <span class="o">=</span> <span class="n">remote</span><span class="o">.</span><span class="n">cl</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
 </code></pre></div></div>
 
-<p>When <code class="language-plaintext highlighter-rouge">A.asnumpy()</code> is called from the laptop, the matrix <code class="language-plaintext highlighter-rouge">A </code>will be copied to phone’s RAM and then transfer to the laptop through the proxy server. The TVM RPC interface is transparent to users.</p>
+<p>When <code class="highlighter-rouge">A.asnumpy()</code> is called from the laptop, the matrix <code class="highlighter-rouge">A </code>will be copied to phone’s RAM and then transfer to the laptop through the proxy server. The TVM RPC interface is transparent to users.</p>
 
 <h2 id="gemm-matrix-multiplication-on-the-phone">GEMM (Matrix Multiplication) on the Phone</h2>
 
@@ -218,34 +218,34 @@ In order to optimize a computation task, one has to edit the code on the develop
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">import</span> <span class="nn">tvm</span>
 <span class="k">def</span> <span class="nf">gemm</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">bn</span><span class="p">):</span>
-    <span class="n">A</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'A'</span><span class="p">)</span>
-    <span class="n">B</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'B'</span><span class="p">)</span>
-    <span class="n">k</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'k'</span><span class="p">)</span>
+    <span class="n">A</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'A'</span><span class="p">)</span>
+    <span class="n">B</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'B'</span><span class="p">)</span>
+    <span class="n">k</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'k'</span><span class="p">)</span>
 
-    <span class="n">C</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span>
+    <span class="n">C</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span>
         <span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span>
-        <span class="k">lambda</span> <span class="n">ii</span><span class="p">,</span> <span class="n">jj</span><span class="p">:</span> <span class="n">tvm</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span><span class="n">A</span><span class="p">[</span><span class="n">ii</span><span class="p">,</span> <span class="n">k</span><span class="p">]</span> <span class="o">*</span> <span class="n">B</span><span class="p">[</span><span class="n">k</span><span cla [...]
+        <span class="k">lambda</span> <span class="n">ii</span><span class="p">,</span> <span class="n">jj</span><span class="p">:</span> <span class="n">tvm</span><span class="o">.</span><span class="nb">sum</span><span class="p">(</span><span class="n">A</span><span class="p">[</span><span class="n">ii</span><span class="p">,</span> <span class="n">k</span><span class="p">]</span> <span class="o">*</span> <span class="n">B</span><span class="p">[</span><span class="n">k</span><span cla [...]
         <span class="n">name</span><span class="o">=</span><span class="s">'C'</span><span class="p">)</span>
 
-    <span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">C</span><span class="p">.</span><span class="n">op</span><span class="p">)</span>
+    <span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">C</span><span class="o">.</span><span class="n">op</span><span class="p">)</span>
 
-    <span class="n">block_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">)</span>
-    <span class="n">thread_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.x"</span><span class="p">)</span>
+    <span class="n">block_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">)</span>
+    <span class="n">thread_x</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.x"</span><span class="p">)</span>
 
-    <span class="n">bo</span><span class="p">,</span> <span class="n">bi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">C</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">factor</span><span clas [...]
-    <span class="n">to</span><span class="p">,</span> <span class="n">ti</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">C</span><span class="p">.</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">factor</span><span clas [...]
-    <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">bi</span><span class="p">,</span> <span class="n">block_x</span><span class="p">)</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">ti</span><span class="p">,</span> <span class="n">thread_x</span><span class="p">)</span>
+    <span class="n">bo</span><span class="p">,</span> <span class="n">bi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">C</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n"> [...]
+    <span class="n">to</span><span class="p">,</span> <span class="n">ti</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">C</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n"> [...]
+    <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">bi</span><span class="p">,</span> <span class="n">block_x</span><span class="p">)</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">C</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">ti</span><span class="p">,</span> <span class="n">thread_x</span><span class="p">)</span>
 
-    <span class="k">print</span><span class="p">(</span><span class="n">tvm</span><span class="p">.</span><span class="n">lower</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">C</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="bp">True</span><span class="p">))</span>
+    <span class="k">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">C</span><span class="p">],</span> <span class="n">simple_mode</span><span class="o">=</span><span class="bp">True</span><span class="p">))</span>
 
-    <span class="k">return</span> <span class="n">tvm</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">C</span><span class="p">],</span>
+    <span class="k">return</span> <span class="n">tvm</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">C</span><span class="p">],</span>
     	<span class="s">"opencl"</span><span class="p">,</span>
     	<span class="n">target_host</span><span class="o">=</span><span class="s">"llvm -target=arm64-linux-android"</span><span class="p">,</span>
     	<span class="n">name</span><span class="o">=</span><span class="s">"gemm_gpu"</span><span class="p">)</span>
 </code></pre></div></div>
 
-<p>There’s nothing special except the last line. Here we set the target to ‘opencl’ since this is the computation language which our Mali GPU supports. Note that we set <code class="language-plaintext highlighter-rouge">target_host</code> to ‘<code class="language-plaintext highlighter-rouge">llvm -target=arm64-linux-android</code>’, it depends on what architecture your Android Phone is. We tested on Samsung Galaxy S6 Edge, which has a Mali-T760 GPU. Here is the CPU info for this phone,</p>
+<p>There’s nothing special except the last line. Here we set the target to ‘opencl’ since this is the computation language which our Mali GPU supports. Note that we set <code class="highlighter-rouge">target_host</code> to ‘<code class="highlighter-rouge">llvm -target=arm64-linux-android</code>’, it depends on what architecture your Android Phone is. We tested on Samsung Galaxy S6 Edge, which has a Mali-T760 GPU. Here is the CPU info for this phone,</p>
 
 <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">$ </span>adb shell
 shell@zenltechn:/ <span class="nv">$ </span><span class="nb">cat</span> /proc/cpuinfo
@@ -270,17 +270,17 @@ Hardware	: SAMSUNG Exynos7420
 
 <p>Please refer to <a href="https://clang.llvm.org/docs/CrossCompilation.html#target-triple">target triple</a> to learn the compile options for LLVM.</p>
 
-<p>We use <code class="language-plaintext highlighter-rouge">tvm.contrib.ndk</code> to build the shared library for the Android system,</p>
+<p>We use <code class="highlighter-rouge">tvm.contrib.ndk</code> to build the shared library for the Android system,</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">from</span> <span class="nn">tvm.contrib</span> <span class="kn">import</span> <span class="n">rpc</span><span class="p">,</span> <span class="n">util</span><span class="p">,</span> <span class="n">ndk</span>
 <span class="n">N</span> <span class="o">=</span> <span class="mi">1024</span>
 <span class="n">f</span> <span class="o">=</span> <span class="n">gemm</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">bn</span> <span class="o">=</span> <span class="mi">256</span><span class="p">)</span>
-<span class="n">temp</span> <span class="o">=</span> <span class="n">util</span><span class="p">.</span><span class="n">tempdir</span><span class="p">()</span>
-<span class="n">path_dso</span> <span class="o">=</span> <span class="n">temp</span><span class="p">.</span><span class="n">relpath</span><span class="p">(</span><span class="s">"gemm_gpu.so"</span><span class="p">)</span>
-<span class="n">f</span><span class="p">.</span><span class="n">export_library</span><span class="p">(</span><span class="n">path_dso</span><span class="p">,</span> <span class="n">ndk</span><span class="p">.</span><span class="n">create_shared</span><span class="p">)</span>
+<span class="n">temp</span> <span class="o">=</span> <span class="n">util</span><span class="o">.</span><span class="n">tempdir</span><span class="p">()</span>
+<span class="n">path_dso</span> <span class="o">=</span> <span class="n">temp</span><span class="o">.</span><span class="n">relpath</span><span class="p">(</span><span class="s">"gemm_gpu.so"</span><span class="p">)</span>
+<span class="n">f</span><span class="o">.</span><span class="n">export_library</span><span class="p">(</span><span class="n">path_dso</span><span class="p">,</span> <span class="n">ndk</span><span class="o">.</span><span class="n">create_shared</span><span class="p">)</span>
 </code></pre></div></div>
 
-<p><code class="language-plaintext highlighter-rouge">ndk.create_shared</code> reads the environment variable <code class="language-plaintext highlighter-rouge">TVM_NDK_CC</code> to find the compiler &amp; linker for the Android device. We can easily use NDK to generate standalone toolchain for our device. For example, the following commands generate standalone compilers and linkers for ARM64 Android devices.</p>
+<p><code class="highlighter-rouge">ndk.create_shared</code> reads the environment variable <code class="highlighter-rouge">TVM_NDK_CC</code> to find the compiler &amp; linker for the Android device. We can easily use NDK to generate standalone toolchain for our device. For example, the following commands generate standalone compilers and linkers for ARM64 Android devices.</p>
 
 <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nb">cd</span> /opt/android-ndk/build/tools/
 ./make-standalone-toolchain.sh <span class="nt">--platform</span><span class="o">=</span>android-24 <span class="nt">--use-llvm</span> <span class="nt">--arch</span><span class="o">=</span>arm64 <span class="nt">--install-dir</span><span class="o">=</span>/opt/android-toolchain-arm64
@@ -288,34 +288,34 @@ Hardware	: SAMSUNG Exynos7420
 
 <p>If everything goes right, we’ve got a shared library ‘gemm_gpu.so’. Now let’s upload it to the mobile phone, make the phone load the module and get a remote handler,</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">remote</span> <span class="o">=</span> <span class="n">rpc</span><span class="p">.</span><span class="n">connect</span><span class="p">(</span><span class="s">"0.0.0.0"</span><span class="p">,</span> <span class="mi">9090</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="s">"android"</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">remote</span> <span class="o">=</span> <span class="n">rpc</span><span class="o">.</span><span class="n">connect</span><span class="p">(</span><span class="s">"0.0.0.0"</span><span class="p">,</span> <span class="mi">9090</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="s">"android"</span><span class="p">)</span>
 
-<span class="n">remote</span><span class="p">.</span><span class="n">upload</span><span class="p">(</span><span class="n">path_dso</span><span class="p">)</span>
-<span class="n">f</span> <span class="o">=</span> <span class="n">remote</span><span class="p">.</span><span class="n">load_module</span><span class="p">(</span><span class="s">"gemm_gpu.so"</span><span class="p">)</span>
+<span class="n">remote</span><span class="o">.</span><span class="n">upload</span><span class="p">(</span><span class="n">path_dso</span><span class="p">)</span>
+<span class="n">f</span> <span class="o">=</span> <span class="n">remote</span><span class="o">.</span><span class="n">load_module</span><span class="p">(</span><span class="s">"gemm_gpu.so"</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>Create the remote arrays and print the running time,</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">ctx</span> <span class="o">=</span> <span class="n">remote</span><span class="p">.</span><span class="n">cl</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">ctx</span> <span class="o">=</span> <span class="n">remote</span><span class="o">.</span><span class="n">cl</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
 
 <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="n">np</span>
-<span class="n">a_np</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">)).</span><span class="n">astype</span><span class="p">(</span><span class="s">"float32"</span><span class="p">)</span>
-<span class="n">b_np</span> <span class="o">=</span> <span class="n">np</span><span class="p">.</span><span class="n">random</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">)).</span><span class="n">astype</span><span class="p">(</span><span class="s">"float32"</span><span class="p">)</span>
+<span class="n">a_np</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">))</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s">"float32"</span>< [...]
+<span class="n">b_np</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">))</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s">"float32"</span>< [...]
 
-<span class="n">a</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">a_np</span><span class="p">,</span> <span class="n">ctx</span><span class="p">)</span>
-<span class="n">b</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">b_np</span><span class="p">,</span> <span class="n">ctx</span><span class="p">)</span>
-<span class="n">c</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="p">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="s">"float32"</span><span  [...]
+<span class="n">a</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">a_np</span><span class="p">,</span> <span class="n">ctx</span><span class="p">)</span>
+<span class="n">b</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">b_np</span><span class="p">,</span> <span class="n">ctx</span><span class="p">)</span>
+<span class="n">c</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="s">"float32"</span><span  [...]
 
-<span class="n">time_f</span> <span class="o">=</span> <span class="n">f</span><span class="p">.</span><span class="n">time_evaluator</span><span class="p">(</span><span class="n">f</span><span class="p">.</span><span class="n">entry_name</span><span class="p">,</span> <span class="n">ctx</span><span class="p">,</span> <span class="n">number</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
-<span class="n">cost</span> <span class="o">=</span> <span class="n">time_f</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">).</span><span class="n">mean</span>
-<span class="k">print</span><span class="p">(</span><span class="s">'%g secs/op, %g GFLOPS'</span> <span class="o">%</span> <span class="p">(</span><span class="n">cost</span><span class="p">,</span> <span class="n">ngflops</span><span class="p">(</span><span class="n">N</span><span class="p">)</span> <span class="o">/</span> <span class="n">cost</span><span class="p">))</span>
+<span class="n">time_f</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">time_evaluator</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">entry_name</span><span class="p">,</span> <span class="n">ctx</span><span class="p">,</span> <span class="n">number</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
+<span class="n">cost</span> <span class="o">=</span> <span class="n">time_f</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span>
+<span class="k">print</span><span class="p">(</span><span class="s">'</span><span class="si">%</span><span class="s">g secs/op, </span><span class="si">%</span><span class="s">g GFLOPS'</span> <span class="o">%</span> <span class="p">(</span><span class="n">cost</span><span class="p">,</span> <span class="n">ngflops</span><span class="p">(</span><span class="n">N</span><span class="p">)</span> <span class="o">/</span> <span class="n">cost</span><span class="p">))</span>
 </code></pre></div></div>
 
 <p>Now we can verify the results on PC,</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">np</span><span class="p">.</span><span class="n">testing</span><span class="p">.</span><span class="n">assert_almost_equal</span><span class="p">(</span>
-	<span class="n">c</span><span class="p">.</span><span class="n">asnumpy</span><span class="p">(),</span>
-	<span class="n">a_np</span><span class="p">.</span><span class="n">dot</span><span class="p">(</span><span class="n">b_np</span><span class="p">),</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">np</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_almost_equal</span><span class="p">(</span>
+	<span class="n">c</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">(),</span>
+	<span class="n">a_np</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">b_np</span><span class="p">),</span>
 	<span class="n">decimal</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
 </code></pre></div></div>
 
diff --git a/2018/01/16/opt-mali-gpu.html b/2018/01/16/opt-mali-gpu.html
index 7a0cb9d..795539f 100644
--- a/2018/01/16/opt-mali-gpu.html
+++ b/2018/01/16/opt-mali-gpu.html
@@ -228,7 +228,7 @@ not require explicit vectorization. But also notice that the newer
 Mali Bitfrost GPUs are based on quad-style vectorization and does not
 require explicit vectorization.</li>
   <li>All threads in Mali GPUs have individual program counters. It means
-the <code class="language-plaintext highlighter-rouge">warp size</code> is 1, so that branch divergence is not a major problem.</li>
+the <code class="highlighter-rouge">warp size</code> is 1, so that branch divergence is not a major problem.</li>
 </ul>
 
 <h1 id="optimization--convolution-as-example">Optimization : Convolution as Example</h1>
@@ -299,15 +299,15 @@ tiling so that we can access the memory sequentially, which reduces
 cache miss rate.</p>
 
 <p>We do tiling on the width dimension of the input image and CO dimension
-of the filter matrix.  This is described by <code class="language-plaintext highlighter-rouge">tvm.compute</code>.</p>
+of the filter matrix.  This is described by <code class="highlighter-rouge">tvm.compute</code>.</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1"># set tiling factor
 </span><span class="n">VH</span> <span class="o">=</span> <span class="mi">1</span>
 <span class="n">VW</span> <span class="o">=</span> <span class="n">VC</span> <span class="o">=</span> <span class="mi">4</span>
 
 <span class="c1"># get input shape
-</span> <span class="n">_</span><span class="p">,</span> <span class="n">CI</span><span class="p">,</span> <span class="n">IH</span><span class="p">,</span> <span class="n">IW</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="n">shape</span>
-<span class="n">CO</span><span class="p">,</span> <span class="n">CI</span><span class="p">,</span> <span class="n">KH</span><span class="p">,</span> <span class="n">KW</span> <span class="o">=</span> <span class="n">kernel</span><span class="p">.</span><span class="n">shape</span>
+</span> <span class="n">_</span><span class="p">,</span> <span class="n">CI</span><span class="p">,</span> <span class="n">IH</span><span class="p">,</span> <span class="n">IW</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span>
+<span class="n">CO</span><span class="p">,</span> <span class="n">CI</span><span class="p">,</span> <span class="n">KH</span><span class="p">,</span> <span class="n">KW</span> <span class="o">=</span> <span class="n">kernel</span><span class="o">.</span><span class="n">shape</span>
 <span class="n">TH</span> <span class="o">=</span> <span class="n">IH</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">H_PAD</span>
 <span class="n">TW</span> <span class="o">=</span> <span class="n">IW</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">W_PAD</span>
 
@@ -325,33 +325,33 @@ of the filter matrix.  This is described by <code class="language-plaintext high
 <span class="n">oshape</span> <span class="o">=</span> <span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">CO</span><span class="p">,</span> <span class="n">OH</span><span class="p">,</span> <span class="n">OW</span><span class="p">)</span>
 
 <span class="c1"># define packing
-</span><span class="n">data_vec</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span><span class="n">dvshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">n</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">, [...]
+</span><span class="n">data_vec</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span><span class="n">dvshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">n</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">, [...]
     <span class="n">data_pad</span><span class="p">[</span><span class="n">n</span><span class="p">][</span><span class="n">ci</span><span class="p">][</span><span class="n">h</span><span class="o">*</span><span class="n">VH</span><span class="o">*</span><span class="n">H_STRIDE</span><span class="o">+</span><span class="n">vh</span><span class="p">][</span><span class="n">w</span><span class="o">*</span><span class="n">VW</span><span class="o">*</span><span class="n">W_STRIDE</span><spa [...]
 
-<span class="n">kernel_vec</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span><span class="n">kvshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span><span class="p">:</span>
+<span class="n">kernel_vec</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span><span class="n">kvshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span><span class="p">:</span>
     <span class="n">kernel</span><span class="p">[</span><span class="n">co</span><span class="o">*</span><span class="n">VC</span><span class="o">+</span><span class="n">vc</span><span class="p">][</span><span class="n">ci</span><span class="p">][</span><span class="n">kh</span><span class="p">][</span><span class="n">kw</span><span class="p">],</span> <span class="n">name</span><span class="o">=</span><span class="s">'kernel_vec'</span><span class="p">)</span>
 
 <span class="c1"># define convolution
-</span><span class="n">ci</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">CI</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'ci'</span><span class="p">)</span>
-<span class="n">kh</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">KH</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'kh'</span><span class="p">)</span>
-<span class="n">kw</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">KW</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'kw'</span><span class="p">)</span>
+</span><span class="n">ci</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">CI</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'ci'</span><span class="p">)</span>
+<span class="n">kh</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">KH</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'kh'</span><span class="p">)</span>
+<span class="n">kw</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">KW</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'kw'</span><span class="p">)</span>
 
-<span class="n">conv</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span><span class="n">ovshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">n</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <sp [...]
-    <span class="n">tvm</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span><span class="n">data_vec</span><span class="p">[</span><span class="n">n</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="o">*</span><span class="n">H_STRIDE</span><span class="o">+</span><span class="n">kh</span><span  [...]
-            <span class="n">kernel_vec</span><span class="p">[</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span><span class="p">].</span><span class="n">astype</span><span class="p">(</span><span class="n">out_dtype</span><span class="p">),</span>
+<span class="n">conv</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span><span class="n">ovshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">n</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <sp [...]
+    <span class="n">tvm</span><span class="o">.</span><span class="nb">sum</span><span class="p">(</span><span class="n">data_vec</span><span class="p">[</span><span class="n">n</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="o">*</span><span class="n">H_STRIDE</span><span class="o">+</span><span class="n">kh</span><span  [...]
+            <span class="n">kernel_vec</span><span class="p">[</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">out_dtype</span><span class="p">),</span>
             <span class="n">axis</span><span class="o">=</span><span class="p">[</span><span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">]),</span> <span class="n">name</span><span class="o">=</span><span class="s">'conv'</span><span class="p">)</span>
 
 <span class="c1"># unpack to correct layout
-</span><span class="n">output</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">(</span><span class="n">oshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">n</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">:</span>
+</span><span class="n">output</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">(</span><span class="n">oshape</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">n</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">:</span>
                      <span class="n">conv</span><span class="p">[</span><span class="n">n</span><span class="p">][</span><span class="n">co</span><span class="o">//</span><span class="n">VC</span><span class="p">][</span><span class="n">h</span><span class="o">/</span><span class="n">VH</span><span class="p">][</span><span class="n">w</span><span class="o">//</span><span class="n">VW</span><span class="p">][</span><span class="n">h</span><span class="o">%</span><span class="n">VH</span>< [...]
                      <span class="n">name</span><span class="o">=</span><span class="s">'output_unpack'</span><span class="p">,</span> <span class="n">tag</span><span class="o">=</span><span class="s">'direct_conv_output'</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>We can inspect the defined IR by</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">print</span><span class="p">(</span><span class="n">tvm</span><span class="p">.</span><span class="n">lower</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">data</span><span class="p">,</span> <span class="n">kernel</span><span class="p">,</span> <span class="n">output</span><span class="p">],</span> <span [...]
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">print</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">lower</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">data</span><span class="p">,</span> <span class="n">kernel</span><span class="p">,</span> <span class="n">output</span><span class="p">],</span> <span [...]
 </code></pre></div></div>
 <p>I pick the convolution part here.</p>
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>produce conv {
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>produce conv {
   for (co, 0, 64) {
     for (h, 0, 56) {
       for (w, 0, 14) {
@@ -390,35 +390,35 @@ our code can run on Mali GPU.</p>
     <span class="s">""" tile and bind 3d """</span>
     <span class="n">y_factor</span> <span class="o">=</span> <span class="n">y_factor</span> <span class="ow">or</span> <span class="n">z_factor</span>
     <span class="n">x_factor</span> <span class="o">=</span> <span class="n">x_factor</span> <span class="ow">or</span> <span class="n">y_factor</span>
-    <span class="n">zo</span><span class="p">,</span> <span class="n">zi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">z</span><span class="p">,</span> <span class="n">z_factor</span><span class="p">)</span>
-    <span class="n">yo</span><span class="p">,</span> <span class="n">yi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_factor</span><span class="p">)</span>
-    <span class="n">xo</span><span class="p">,</span> <span class="n">xi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">split</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">x_factor</span><span class="p">)</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">zo</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.z"</span><span class="p">))</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">zi</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.z"</span><span class="p">))</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">yo</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.y"</span><span class="p">))</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">yi</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.y"</span><span class="p">))</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">xo</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">))</span>
-    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">].</span><span class="n">bind</span><span class="p">(</span><span class="n">xi</span><span class="p">,</span> <span class="n">tvm</span><span class="p">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.x"</span><span class="p">))</span>
+    <span class="n">zo</span><span class="p">,</span> <span class="n">zi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">z</span><span class="p">,</span> <span class="n">z_factor</span><span class="p">)</span>
+    <span class="n">yo</span><span class="p">,</span> <span class="n">yi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">y_factor</span><span class="p">)</span>
+    <span class="n">xo</span><span class="p">,</span> <span class="n">xi</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">x_factor</span><span class="p">)</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">zo</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.z"</span><span class="p">))</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">zi</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.z"</span><span class="p">))</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">yo</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.y"</span><span class="p">))</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">yi</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.y"</span><span class="p">))</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">xo</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"blockIdx.x"</span><span class="p">))</span>
+    <span class="n">s</span><span class="p">[</span><span class="n">tensor</span><span class="p">]</span><span class="o">.</span><span class="n">bind</span><span class="p">(</span><span class="n">xi</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">thread_axis</span><span class="p">(</span><span class="s">"threadIdx.x"</span><span class="p">))</span>
 
 <span class="c1"># set tunable parameter
 </span><span class="n">num_thread</span> <span class="o">=</span> <span class="mi">8</span>
 
 <span class="c1"># schedule data packing
-</span><span class="n">_</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+</span><span class="n">_</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class [...]
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">data_vec</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="c1"># schedule kernel packing
-</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span>
 <span class="n">tile_and_bind</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">kernel_vec</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="c1"># schedule conv
-</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">op</span><span class=" [...]
-<span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">reduce_axis</span>
+</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n" [...]
+<span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">reduce_axis</span>
 
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">reorder</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">, [...]
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">reorder</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">kc</span><span class="p">,</span> <span class="n">kh< [...]
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">conv</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">num_thread</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
-<span class="n">_</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">output</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+<span class="n">_</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">output</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span>
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span><span class="p">,</span> <span class="n">num_thread</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 </code></pre></div></div>
 
@@ -452,41 +452,41 @@ our code can run on Mali GPU.</p>
 <h3 id="kernel-2-unrolling">Kernel 2: unrolling</h3>
 <p>Loop unrolling can reduce the instructions for loop control, reduce
 branch penalties and hide latency in reading memory.
-In TVM, this can be done easily by calling <code class="language-plaintext highlighter-rouge">s.unroll(axis)</code></p>
+In TVM, this can be done easily by calling <code class="highlighter-rouge">s.unroll(axis)</code></p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1"># set tunable parameter
 </span><span class="n">num_thread</span> <span class="o">=</span> <span class="mi">8</span>
 
 <span class="c1"># schedule data packing
-</span><span class="n">_</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+</span><span class="n">_</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class [...]
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">data_vec</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="s">"""!! ADD UNROLL HERE !!"""</span>
-<span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
 
 <span class="c1"># schedule kernel packing
-</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span>
 <span class="n">tile_and_bind</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">kernel_vec</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="s">"""!! ADD UNROLL HERE !!"""</span>
-<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
 
 <span class="c1"># schedule conv
-</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">op</span><span class=" [...]
-<span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">reduce_axis</span>
+</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n" [...]
+<span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">reduce_axis</span>
 
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">reorder</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">, [...]
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">reorder</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">kc</span><span class="p">,</span> <span class="n">kh< [...]
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">conv</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">num_thread</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="s">"""!! ADD UNROLL HERE !!"""</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
 
-<span class="n">_</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">output</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+<span class="n">_</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">output</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span>
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span><span class="p">,</span> <span class="n">num_thread</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 </code></pre></div></div>
 
@@ -529,37 +529,37 @@ In TVM, this can be done easily by calling <code class="language-plaintext highl
 </span><span class="n">num_thread</span> <span class="o">=</span> <span class="mi">8</span>
 
 <span class="c1"># schedule data packing
-</span><span class="n">_</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+</span><span class="n">_</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class [...]
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">data_vec</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="c1"># unroll
-</span><span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
+</span><span class="n">s</span><span class="p">[</span><span class="n">data_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
 
 <span class="c1"># schedule kernel packing
-</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+</span><span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span>
 <span class="n">tile_and_bind</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">kernel_vec</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">ci</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="c1"># unroll
-</span><span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
+</span><span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
 <span class="s">"""!! VECTORIZE HERE !!"""</span>
-<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">].</span><span class="n">vectorize</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">kernel_vec</span><span class="p">]</span><span class="o">.</span><span class="n">vectorize</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
 
 <span class="c1"># schedule conv
-</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">op</span><span class=" [...]
-<span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">reduce_axis</span>
+</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">vw</span><span class="p">,</span> <span class="n">vc</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n" [...]
+<span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">,</span> <span class="n">kw</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">reduce_axis</span>
 
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">reorder</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">kc</span><span class="p">,</span> <span class="n">kh</span><span class="p">, [...]
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">reorder</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">vh</span><span class="p">,</span> <span class="n">kc</span><span class="p">,</span> <span class="n">kh< [...]
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">conv</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">num_thread</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 
 <span class="c1"># unroll
-</span><span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
+</span><span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kh</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">kw</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">unroll</span><span class="p">(</span><span class="n">vw</span><span class="p">)</span>
 <span class="s">"""!! VECTORIZE HERE !!"""</span>
-<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">].</span><span class="n">vectorize</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
+<span class="n">s</span><span class="p">[</span><span class="n">conv</span><span class="p">]</span><span class="o">.</span><span class="n">vectorize</span><span class="p">(</span><span class="n">vc</span><span class="p">)</span>
 
-<span class="n">_</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">output</span><span class="p">].</span><span class="n">op</span><span class="p">.</span><span class="n">axis</span>
+<span class="n">_</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span> <span class="o">=</span> <span class="n">s</span><span class="p">[</span><span class="n">output</span><span class="p">]</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">axis</span>
 <span class="n">tile_and_bind3d</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">co</span><span class="p">,</span> <span class="n">oh</span><span class="p">,</span> <span class="n">ow</span><span class="p">,</span> <span class="n">num_thread</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
 </code></pre></div></div>
 
@@ -602,7 +602,7 @@ In TVM, this can be done easily by calling <code class="language-plaintext highl
 
 <h3 id="how-to-set-the-tunable-parameter">How to set the tunable parameter</h3>
 <p>As for the tunable parameters above, some can be calculated.
-For the vectorized dimension <code class="language-plaintext highlighter-rouge">VC</code>, we should fill the 128-bit register,
+For the vectorized dimension <code class="highlighter-rouge">VC</code>, we should fill the 128-bit register,
 so it can be set as 128/32=4 for float32 and 128/16=8 for float16.</p>
 
 <p>But more often we cannot determine the optimal value, due to the
@@ -612,7 +612,7 @@ IR rather than direct OpenCL code.</p>
 
 <h3 id="the-generated-opencl-code">The generated OpenCL code</h3>
 <p>We can view the generated OpenCL code by</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">print</span><span class="p">(</span><span class="n">func</span><span class="p">.</span><span class="n">imported_modules</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">get_source</span><span class="p">())</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">print</span><span class="p">(</span><span class="n">func</span><span class="o">.</span><span class="n">imported_modules</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">get_source</span><span class="p">())</span>
 </code></pre></div></div>
 <p>The OpenCL code is too long to be pasted here, and it is hard to read due
 to heavy unrolling. If interested, you can view it
@@ -623,7 +623,7 @@ to heavy unrolling. If interested, you can view it
 different backends on some popular deep neural networks.
 Our test environment is</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>Firefly-RK3399 4G
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>Firefly-RK3399 4G
 CPU: dual-core Cortex-A72 + quad-core Cortex-A53
 GPU: Mali-T860MP4
 
diff --git a/2018/03/23/nmt-transformer-optimize.html b/2018/03/23/nmt-transformer-optimize.html
index 1c54d60..b85019d 100644
--- a/2018/03/23/nmt-transformer-optimize.html
+++ b/2018/03/23/nmt-transformer-optimize.html
@@ -189,7 +189,7 @@ One paricular challenge we observed, is that batch matmul is a major performance
 
 <p>Batch matmul computation can be described more concretely as follows:</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>void BatchedGemm(input A, input B, output C, M, N, K, batch_dimension) {
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>void BatchedGemm(input A, input B, output C, M, N, K, batch_dimension) {
   for (int i = 0; i &lt; batch_dimension; ++i)  {
     DoGemm(A[i],B[i],C[i],M,K,N)
   }
@@ -258,7 +258,7 @@ One paricular challenge we observed, is that batch matmul is a major performance
 
 <p>In TVM, a general batch matmul computation can be declared as:</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># computation representation
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code># computation representation
 A = tvm.placeholder((batch, M, K), name='A')
 B = tvm.placeholder((batch, K, N), name='B')
 k = tvm.reduce_axis((0, K), 'k')
@@ -273,7 +273,7 @@ C = tvm.compute((batch, M, N),
 
 <h3 id="tuning-parameters-of-blockthread-numbers">Tuning parameters of block/thread numbers</h3>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>  # thread indices
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>  # thread indices
   block_y = tvm.thread_axis("blockIdx.y")
   block_x = tvm.thread_axis("blockIdx.x")
   thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
@@ -298,9 +298,9 @@ C = tvm.compute((batch, M, N),
   s[C].bind(vty, thread_yz)
   s[C].bind(vtx, thread_xz)
 </code></pre></div></div>
-<p>We fuse the outer dimensions of the batch matmul, i.e. the BB and FF of the op’s dimension, normally known as “batch” dimension in batch matmul computation. Then we split the outer and the inner dimensions by a factor of (<code class="language-plaintext highlighter-rouge">number_thread * vthread</code>).</p>
+<p>We fuse the outer dimensions of the batch matmul, i.e. the BB and FF of the op’s dimension, normally known as “batch” dimension in batch matmul computation. Then we split the outer and the inner dimensions by a factor of (<code class="highlighter-rouge">number_thread * vthread</code>).</p>
 
-<p>Strided pattern is not needed in batch matmul, thus the virtual thread number (<code class="language-plaintext highlighter-rouge">vthread_y</code> and <code class="language-plaintext highlighter-rouge">vthread_x</code>) are both set to 1.</p>
+<p>Strided pattern is not needed in batch matmul, thus the virtual thread number (<code class="highlighter-rouge">vthread_y</code> and <code class="highlighter-rouge">vthread_x</code>) are both set to 1.</p>
 
 <h4 id="finding-the-best-combination-of-number_thread">Finding the best combination of number_thread</h4>
 
@@ -349,7 +349,7 @@ C = tvm.compute((batch, M, N),
   </tbody>
 </table>
 
-<p>As learned from <a href="http://tvmlang.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html">past experience</a>, the method to find the best combination of <code class="language-plaintext highlighter-rouge">num_thread_y</code> and <code class="language-plaintext highlighter-rouge">num_thread_x</code> is through brute-force search. After a brute-force search, the best combination for current shape can be found, which in current computation [...]
+<p>As learned from <a href="http://tvmlang.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html">past experience</a>, the method to find the best combination of <code class="highlighter-rouge">num_thread_y</code> and <code class="highlighter-rouge">num_thread_x</code> is through brute-force search. After a brute-force search, the best combination for current shape can be found, which in current computation is <code class="highlighter-rouge">nu [...]
 
 <h2 id="fuse-batch-matmul-with-other-operations">Fuse batch matmul with other operations</h2>
 
@@ -359,7 +359,7 @@ C = tvm.compute((batch, M, N),
 
 <p>Batch matmul and broadcast add fusion computation can be declared as follows:</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># computation representation
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code># computation representation
 A = tvm.placeholder((batch_size, features, M, K), name='A')
 # the shape of B is (N, K) other than (K, N) is because B is transposed is this fusion pattern
 B = tvm.placeholder((batch_size, features, N, K), name='B')
@@ -374,7 +374,7 @@ D = topi.broadcast_add(C, ENTER)
 
 <p>Batch matmul and transpose fusion computation can be declared as:</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code># computation representation
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code># computation representation
 A = tvm.placeholder((batch_size, features, M, K), name='A')
 B = tvm.placeholder((batch_size, features, K, N), name='B')
 k = tvm.reduce_axis((0, K), 'k')
@@ -388,10 +388,10 @@ C = tvm.compute(
 <p>The shape of [batch=64, heads=8, M=1, N=17, K=128] is chosen to elaborate the performance of the generated code. 17 is chosen as the sequence length since it is the average input length in our production scenarios.</p>
 
 <ul>
-  <li>tf-r1.4 <code class="language-plaintext highlighter-rouge">BatchMatmul</code>: 513.9 us</li>
-  <li>tf-r1.4 <code class="language-plaintext highlighter-rouge">BatchMatmul</code> + <code class="language-plaintext highlighter-rouge">Transpose</code> (separate): 541.9 us</li>
-  <li>TVM <code class="language-plaintext highlighter-rouge">BatchMatmul</code>: 37.62 us</li>
-  <li>TVM <code class="language-plaintext highlighter-rouge">BatchMatmul</code> + <code class="language-plaintext highlighter-rouge">Transpose</code> (fused): 38.39 us</li>
+  <li>tf-r1.4 <code class="highlighter-rouge">BatchMatmul</code>: 513.9 us</li>
+  <li>tf-r1.4 <code class="highlighter-rouge">BatchMatmul</code> + <code class="highlighter-rouge">Transpose</code> (separate): 541.9 us</li>
+  <li>TVM <code class="highlighter-rouge">BatchMatmul</code>: 37.62 us</li>
+  <li>TVM <code class="highlighter-rouge">BatchMatmul</code> + <code class="highlighter-rouge">Transpose</code> (fused): 38.39 us</li>
 </ul>
 
 <p>The kernel fusion optimization brings a further <b><em>1.7X</em></b> speed-up.</p>
diff --git a/2018/08/10/DLPack-Bridge.html b/2018/08/10/DLPack-Bridge.html
index 27008e7..9849d29 100644
--- a/2018/08/10/DLPack-Bridge.html
+++ b/2018/08/10/DLPack-Bridge.html
@@ -226,21 +226,21 @@ Figure 1</p>
 
 <p>First, we compute a reference output in PyTorch:</p>
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="kn">import</span> <span class="nn">torch</span>
-    <span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="p">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">)</span>
-    <span class="n">y</span> <span class="o">=</span> <span class="n">torch</span><span class="p">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">)</span>
-    <span class="n">z</span> <span class="o">=</span> <span class="n">x</span><span class="p">.</span><span class="n">mm</span><span class="p">(</span><span class="n">y</span><span class="p">)</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">)</span>
+    <span class="n">y</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">)</span>
+    <span class="n">z</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">mm</span><span class="p">(</span><span class="n">y</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>We then define and build a TVM matrix multiplication operator, using the default
 schedule:</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="n">n</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">convert</span><span class="p">(</span><span class="mi">56</span><span class="p">)</span>
-    <span class="n">X</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,</span><span class="n">n</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'X'</span><span class="p">)</span>
-    <span class="n">Y</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,</span><span class="n">n</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'Y'</span><span class="p">)</span>
-
-    <span class="n">k</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">n</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'k'</span><span class="p">)</span>
-    <span class="n">Z</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">((</span><span class="n">n</span><span class="p">,</span><span class="n">n</span><span class="p">),</span> <span class="k">lambda</span> <span class="n">i</span><span class="p">,</span><span class="n">j</span> <span class="p">:</span> <span class="n">tvm</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span><span  [...]
-    <span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">Z</span><span class="p">.</span><span class="n">op</span><span class="p">)</span>
-    <span class="n">fmm</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">Z</span><span class="p">],</span> <span class="n">target_host</span><span class="o">=</span><span class="s">'llvm'</span><span class="p">,</span> < [...]
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="n">n</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="mi">56</span><span class="p">)</span>
+    <span class="n">X</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,</span><span class="n">n</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'X'</span><span class="p">)</span>
+    <span class="n">Y</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">placeholder</span><span class="p">((</span><span class="n">n</span><span class="p">,</span><span class="n">n</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'Y'</span><span class="p">)</span>
+
+    <span class="n">k</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">n</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'k'</span><span class="p">)</span>
+    <span class="n">Z</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">((</span><span class="n">n</span><span class="p">,</span><span class="n">n</span><span class="p">),</span> <span class="k">lambda</span> <span class="n">i</span><span class="p">,</span><span class="n">j</span> <span class="p">:</span> <span class="n">tvm</span><span class="o">.</span><span class="nb">sum</span><span class="p">(</span><span  [...]
+    <span class="n">s</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">create_schedule</span><span class="p">(</span><span class="n">Z</span><span class="o">.</span><span class="n">op</span><span class="p">)</span>
+    <span class="n">fmm</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">Z</span><span class="p">],</span> <span class="n">target_host</span><span class="o">=</span><span class="s">'llvm'</span><span class="p">,</span> < [...]
 </code></pre></div></div>
 <p>For brevity, we do not cover TVM’s large collection of scheduling primitives
 that we can use to optimize matrix multiplication. If you wish to make a custom
@@ -252,37 +252,37 @@ found <a href="https://tvm.apache.org/docs//tutorials/optimize/opt_gemm.html">he
     <span class="c1"># fmm is the previously built TVM function (Python function)
 </span>    <span class="c1"># fmm is the wrapped TVM function (Python function)
 </span>    <span class="n">fmm_pytorch</span> <span class="o">=</span> <span class="n">to_pytorch_func</span><span class="p">(</span><span class="n">fmm</span><span class="p">)</span>
-    <span class="n">z2</span> <span class="o">=</span> <span class="n">torch</span><span class="p">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">)</span>
+    <span class="n">z2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">)</span>
     <span class="n">fmm_pytorch</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z2</span><span class="p">)</span>
-    <span class="n">np</span><span class="p">.</span><span class="n">testing</span><span class="p">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">z</span><span class="p">.</span><span class="n">numpy</span><span class="p">(),</span> <span class="n">z2</span><span class="p">.</span><span class="n">numpy</span><span class="p">())</span>
+    <span class="n">np</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">z</span><span class="o">.</span><span class="n">numpy</span><span class="p">(),</span> <span class="n">z2</span><span class="o">.</span><span class="n">numpy</span><span class="p">())</span>
 </code></pre></div></div>
 <p>and verify that the results match.</p>
 
 <p>We can repeat the same example, but using MxNet instead:</p>
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="kn">import</span> <span class="nn">mxnet</span>
     <span class="kn">from</span> <span class="nn">tvm.contrib.mxnet</span> <span class="kn">import</span> <span class="n">to_mxnet_func</span>
-    <span class="n">ctx</span> <span class="o">=</span> <span class="n">mxnet</span><span class="p">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
-    <span class="n">x</span> <span class="o">=</span> <span class="n">mxnet</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
-    <span class="n">y</span> <span class="o">=</span> <span class="n">mxnet</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
-    <span class="n">z</span> <span class="o">=</span> <span class="n">mxnet</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">empty</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
-    <span class="n">f</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">Z</span><span class="p">],</span> <span class="n">target_host</span><span class="o">=</span><span class="s">'llvm'</span><span class="p">,</span> <sp [...]
+    <span class="n">ctx</span> <span class="o">=</span> <span class="n">mxnet</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">mxnet</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
+    <span class="n">y</span> <span class="o">=</span> <span class="n">mxnet</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">uniform</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
+    <span class="n">z</span> <span class="o">=</span> <span class="n">mxnet</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">56</span><span class="p">,</span><span class="mi">56</span><span class="p">),</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
+    <span class="n">f</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="p">[</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">Z</span><span class="p">],</span> <span class="n">target_host</span><span class="o">=</span><span class="s">'llvm'</span><span class="p">,</span> <sp [...]
     <span class="n">f_mxnet</span> <span class="o">=</span> <span class="n">to_mxnet_func</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
     <span class="n">f_mxnet</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">z</span><span class="p">)</span>
-    <span class="n">np</span><span class="p">.</span><span class="n">testing</span><span class="p">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">z</span><span class="p">.</span><span class="n">asnumpy</span><span class="p">(),</span> <span class="n">x</span><span class="p">.</span><span class="n">asnumpy</span><span class="p">().</span><span class="n">dot</span><span class="p">(</span><span class="n">y</span><span class="p">.</span><span class="n"> [...]
+    <span class="n">np</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_allclose</span><span class="p">(</span><span class="n">z</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">(),</span> <span class="n">x</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">()</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">y</span><span class="o">. [...]
 </code></pre></div></div>
 
 <h2 id="under-the-hood-of-the-pytorch-example">Under the hood of the PyTorch Example</h2>
-<p>As TVM provides <a href="https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L455">functions</a> to convert dlpack tensors to tvm <code class="language-plaintext highlighter-rouge">NDArray</code>s and
+<p>As TVM provides <a href="https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L455">functions</a> to convert dlpack tensors to tvm <code class="highlighter-rouge">NDArray</code>s and
 vice-versa, so all that is needed is some syntactic sugar by wrapping functions.
-<code class="language-plaintext highlighter-rouge">convert_func</code> is a generic converter for frameworks using tensors with dlpack
+<code class="highlighter-rouge">convert_func</code> is a generic converter for frameworks using tensors with dlpack
 support, and can be used to implement convenient converters, such as
-<code class="language-plaintext highlighter-rouge">to_pytorch_func</code>.</p>
+<code class="highlighter-rouge">to_pytorch_func</code>.</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">def</span> <span class="nf">convert_func</span><span class="p">(</span><span class="n">tvm_func</span><span class="p">,</span> <span class="n">tensor_type</span><span class="p">,</span> <span class="n">to_dlpack_func</span><span class="p">):</span>
     <span class="k">assert</span> <span class="nb">callable</span><span class="p">(</span><span class="n">tvm_func</span><span class="p">)</span>
 
     <span class="k">def</span> <span class="nf">_wrapper</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">):</span>
-        <span class="n">args</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">ndarray</span><span class="p">.</span><span class="n">from_dlpack</span><span class="p">(</span><span class="n">to_dlpack_func</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span>\
+        <span class="n">args</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">ndarray</span><span class="o">.</span><span class="n">from_dlpack</span><span class="p">(</span><span class="n">to_dlpack_func</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span>\
             <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">tensor_type</span><span class="p">)</span> <span class="k">else</span> <span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">)</span>
         <span class="k">return</span> <span class="n">tvm_func</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span>
 
@@ -291,7 +291,7 @@ support, and can be used to implement convenient converters, such as
 <span class="k">def</span> <span class="nf">to_pytorch_func</span><span class="p">(</span><span class="n">tvm_func</span><span class="p">):</span>
     <span class="kn">import</span> <span class="nn">torch</span>
     <span class="kn">import</span> <span class="nn">torch.utils.dlpack</span>
-    <span class="k">return</span> <span class="n">convert_func</span><span class="p">(</span><span class="n">tvm_func</span><span class="p">,</span> <span class="n">torch</span><span class="p">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">torch</span><span class="p">.</span><span class="n">utils</span><span class="p">.</span><span class="n">dlpack</span><span class="p">.</span><span class="n">to_dlpack</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">convert_func</span><span class="p">(</span><span class="n">tvm_func</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">dlpack</span><span class="o">.</span><span class="n">to_dlpack</span><span class="p">)</span>
 </code></pre></div></div>
 
     </div>
diff --git a/2018/10/03/auto-opt-all.html b/2018/10/03/auto-opt-all.html
index ae1982b..87f8122 100644
--- a/2018/10/03/auto-opt-all.html
+++ b/2018/10/03/auto-opt-all.html
@@ -216,9 +216,9 @@ Links to tutorials for ARM CPU, Mali GPU, NVIDIA GPU, AMD GPU are all available
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">from</span> <span class="nn">mxnet.gluon.model_zoo.vision</span> <span class="kn">import</span> <span class="n">get_model</span>
 
 <span class="n">block</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">(</span><span class="s">'resnet18_v1'</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
-<span class="n">net</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">nnvm</span><span class="p">.</span><span class="n">frontend</span><span class="p">.</span><span class="n">from_mxnet</span><span class="p">(</span><span class="n">block</span><span class="p">)</span>
+<span class="n">net</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">nnvm</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span><span class="p">(</span><span class="n">block</span><span class="p">)</span>
 
-<span class="n">tasks</span> <span class="o">=</span> <span class="n">autotvm</span><span class="p">.</span><span class="n">extract_from_graph</span><span class="p">(</span><span class="n">net</span><span class="p">)</span>
+<span class="n">tasks</span> <span class="o">=</span> <span class="n">autotvm</span><span class="o">.</span><span class="n">extract_from_graph</span><span class="p">(</span><span class="n">net</span><span class="p">)</span>
 <span class="n">tune_tasks</span><span class="p">(</span><span class="n">tasks</span><span class="p">,</span> <span class="o">**</span><span class="n">tuning_option</span><span class="p">)</span>
 </code></pre></div></div>
 <p>There are 12 different conv2d layers in resnet-18, so we launch 12 tuning tasks.
@@ -226,7 +226,7 @@ For each of them, the tuner makes several hundreds of trials and picks the best
 After finishing all tuning tasks, we compile the whole network and generate a single deployable minimal library.
 One sample output is</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>Extract tasks...
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>Extract tasks...
 Tuning...
 [Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
 [Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
diff --git a/2018/12/18/lowprecision-conv.html b/2018/12/18/lowprecision-conv.html
index 46906ef..997729a 100644
--- a/2018/12/18/lowprecision-conv.html
+++ b/2018/12/18/lowprecision-conv.html
@@ -232,8 +232,8 @@ Finally, the output is computed in an unpacked format and in higher precision.</
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">Input_bitpacked</span> <span class="o">=</span> <span class="n">bitpack</span><span class="p">(</span><span class="n">Input</span><span class="p">,</span> <span class="n">activation_bits</span><span class="p">,</span> <span class="n">pack_axis</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">bit_axis</span><span class="o">=</spa [...]
 <span class="n">Weights_bitpacked</span> <span class="o">=</span> <span class="n">bitpack</span><span class="p">(</span><span class="n">Filter</span><span class="p">,</span> <span class="n">weight_bits</span><span class="p">,</span> <span class="n">pack_axis</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">bit_axis</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">pack_type</span><span class="o"> [...]
-<span class="n">batch</span><span class="p">,</span> <span class="n">in_height</span><span class="p">,</span> <span class="n">in_width</span><span class="p">,</span> <span class="n">in_channel_q</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">Input_bitpacked</span><span class="p">.</span><span class="n">shape</span>
-<span class="n">kernel_h</span><span class="p">,</span> <span class="n">kernel_w</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">num_filter</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">Filter_bitpakced</span><span class="p">.</span><span class="n">shape</span>
+<span class="n">batch</span><span class="p">,</span> <span class="n">in_height</span><span class="p">,</span> <span class="n">in_width</span><span class="p">,</span> <span class="n">in_channel_q</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">Input_bitpacked</span><span class="o">.</span><span class="n">shape</span>
+<span class="n">kernel_h</span><span class="p">,</span> <span class="n">kernel_w</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">num_filter</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">Filter_bitpakced</span><span class="o">.</span><span class="n">shape</span>
 
 <span class="n">stride_h</span><span class="p">,</span> <span class="n">stride_w</span> <span class="o">=</span> <span class="n">stride</span>
 <span class="n">pad_top</span><span class="p">,</span> <span class="n">pad_left</span><span class="p">,</span> <span class="n">pad_down</span><span class="p">,</span> <span class="n">pad_right</span> <span class="o">=</span> <span class="n">get_pad_tuple</span><span class="p">(</span><span class="n">padding</span><span class="p">,</span> <span class="p">(</span><span class="n">kernel_h</span><span class="p">,</span> <span class="n">kernel_w</span><span class="p">))</span>
@@ -247,17 +247,17 @@ Finally, the output is computed in an unpacked format and in higher precision.</
 <span class="n">Input_padded</span> <span class="o">=</span> <span class="n">pad</span><span class="p">(</span><span class="n">Input_bitpacked</span><span class="p">,</span> <span class="n">pad_before</span><span class="p">,</span> <span class="n">pad_after</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s">"PaddedInput"</span><span class="p">)</span>
 
 <span class="c1"># Treat the bitplane axes like additional reduction axes
-</span><span class="n">rc</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">in_channel_q</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'rc'</span><span class="p">)</span>
-<span class="n">ry</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">kernel_h</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'ry'</span><span class="p">)</span>
-<span class="n">rx</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">kernel_w</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'rx'</span><span class="p">)</span>
-<span class="n">ib</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">input_bits</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'ib'</span><span class="p">)</span>
-<span class="n">wb</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">weight_bits</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'wb'</span><span class="p">)</span>
+</span><span class="n">rc</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">in_channel_q</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'rc'</span><span class="p">)</span>
+<span class="n">ry</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">kernel_h</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'ry'</span><span class="p">)</span>
+<span class="n">rx</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">kernel_w</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'rx'</span><span class="p">)</span>
+<span class="n">ib</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">input_bits</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'ib'</span><span class="p">)</span>
+<span class="n">wb</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">reduce_axis</span><span class="p">((</span><span class="mi">0</span><span class="p">,</span> <span class="n">weight_bits</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="s">'wb'</span><span class="p">)</span>
 
 
-<span class="n">tvm</span><span class="p">.</span><span class="n">compute</span><span class="p">((</span><span class="n">batch</span><span class="p">,</span> <span class="n">out_height</span><span class="p">,</span> <span class="n">out_width</span><span class="p">,</span> <span class="n">out_channel</span><span class="p">),</span> <span class="k">lambda</span> <span class="n">nn</span><span class="p">,</span> <span class="n">yy</span><span class="p">,</span> <span class="n">xx</span><spa [...]
-             <span class="n">tvm</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span><span class="n">tvm</span><span class="p">.</span><span class="n">popcount</span><span class="p">(</span>
+<span class="n">tvm</span><span class="o">.</span><span class="n">compute</span><span class="p">((</span><span class="n">batch</span><span class="p">,</span> <span class="n">out_height</span><span class="p">,</span> <span class="n">out_width</span><span class="p">,</span> <span class="n">out_channel</span><span class="p">),</span> <span class="k">lambda</span> <span class="n">nn</span><span class="p">,</span> <span class="n">yy</span><span class="p">,</span> <span class="n">xx</span><spa [...]
+             <span class="n">tvm</span><span class="o">.</span><span class="nb">sum</span><span class="p">(</span><span class="n">tvm</span><span class="o">.</span><span class="n">popcount</span><span class="p">(</span>
                <span class="n">Input_padded</span><span class="p">[</span><span class="n">nn</span><span class="p">,</span> <span class="n">yy</span> <span class="o">*</span> <span class="n">stride_h</span> <span class="o">+</span> <span class="n">ry</span><span class="p">,</span> <span class="n">xx</span> <span class="o">*</span> <span class="n">stride_w</span> <span class="o">+</span> <span class="n">rx</span><span class="p">,</span> <span class="n">rc</span><span class="p">,</span> <s [...]
-               <span class="n">Weights_bitpacked</span><span class="p">[</span><span class="n">ry</span><span class="p">,</span> <span class="n">rx</span><span class="p">,</span> <span class="n">rc</span><span class="p">,</span> <span class="n">ff</span><span class="p">,</span> <span class="n">wb</span><span class="p">]))</span> <span class="o">&lt;&lt;</span> <span class="p">(</span><span class="n">ib</span><span class="o">+</span><span class="n">wb</span><span class="p">))).</span><spa [...]
+               <span class="n">Weights_bitpacked</span><span class="p">[</span><span class="n">ry</span><span class="p">,</span> <span class="n">rx</span><span class="p">,</span> <span class="n">rc</span><span class="p">,</span> <span class="n">ff</span><span class="p">,</span> <span class="n">wb</span><span class="p">]))</span> <span class="o">&lt;&lt;</span> <span class="p">(</span><span class="n">ib</span><span class="o">+</span><span class="n">wb</span><span class="p">)))</span><span [...]
                <span class="n">axis</span><span class="o">=</span><span class="p">[</span><span class="n">rc</span><span class="p">,</span> <span class="n">ry</span><span class="p">,</span> <span class="n">rx</span><span class="p">,</span> <span class="n">wb</span><span class="p">,</span> <span class="n">ib</span><span class="p">]))</span>
 
 </code></pre></div></div>
diff --git a/2019/01/19/Golang.html b/2019/01/19/Golang.html
index ec8dccb..cd312b9 100644
--- a/2019/01/19/Golang.html
+++ b/2019/01/19/Golang.html
@@ -179,12 +179,12 @@ integrates the TVM runtime can load these compiled modules and perform inference
 import and compilation using TVM can be found at <a href="https://tvm.apache.org/docs//tutorials/">tutorials</a>.</p>
 
 <p>TVM now supports deploying compiled modules through Golang. Golang applications can make use of this
-to deploy the deep learning models through TVM. The scope of this blog is the introduction of <code class="language-plaintext highlighter-rouge">gotvm</code> package,
-the package build process and a sample application using <code class="language-plaintext highlighter-rouge">gotvm</code> to load a compiled module and perform inference.</p>
+to deploy the deep learning models through TVM. The scope of this blog is the introduction of <code class="highlighter-rouge">gotvm</code> package,
+the package build process and a sample application using <code class="highlighter-rouge">gotvm</code> to load a compiled module and perform inference.</p>
 
 <h2 id="package">Package</h2>
 
-<p>The golang package <code class="language-plaintext highlighter-rouge">gotvm</code> is built on top of TVM’s C runtime interface. The API in this package
+<p>The golang package <code class="highlighter-rouge">gotvm</code> is built on top of TVM’s C runtime interface. The API in this package
 abstracts the native C types and provides Golang compatible types. The package source can be found
 at <a href="https://github.com/dmlc/tvm/tree/master/golang">gotvm</a>.</p>
 
@@ -197,10 +197,10 @@ necessary conversions across API calls.</p>
 
 <h2 id="how-to">How to</h2>
 
-<p>As shown in the below diagram <code class="language-plaintext highlighter-rouge">gotvm</code> enables golang applications to integrate deep learning models
+<p>As shown in the below diagram <code class="highlighter-rouge">gotvm</code> enables golang applications to integrate deep learning models
 from various frameworks without the hassle of understanding each framework related interface API.
 Developers can make use of TVM to import and compile deep learning models and generate TVM artifacts.
-<code class="language-plaintext highlighter-rouge">gotvm</code> package provides golang friendly API to load, configure, feed input and get output.</p>
+<code class="highlighter-rouge">gotvm</code> package provides golang friendly API to load, configure, feed input and get output.</p>
 
 <p style="text-align: center"><img src="/images/golang/TVM-Golang-Flow.png" alt="image" width="100%" /></p>
 <center> Import, Compile, Integrate and Deploy</center>
@@ -212,8 +212,8 @@ generates the artifacts required to integrate and deploy the model on a target.<
 
 <h2 id="api">API</h2>
 
-<p><code class="language-plaintext highlighter-rouge">gotvm</code> package provides a handful of datatypes and API functions to initialize, load and infer
-from a golang application. Like any other golang package we just need to import <code class="language-plaintext highlighter-rouge">gotvm</code> package here.</p>
+<p><code class="highlighter-rouge">gotvm</code> package provides a handful of datatypes and API functions to initialize, load and infer
+from a golang application. Like any other golang package we just need to import <code class="highlighter-rouge">gotvm</code> package here.</p>
 
 <ul>
   <li>Module : The Module API can be used to load a TVM compiled module into TVM runtime and access any functions.</li>
@@ -302,7 +302,7 @@ For simplicity the error handling is ignored here, but is important in real appl
 <span class="p">}</span>
 </code></pre></div></div>
 
-<p><code class="language-plaintext highlighter-rouge">gotvm</code> extends the TVM packed function system to support golang function closures as packed functions.
+<p><code class="highlighter-rouge">gotvm</code> extends the TVM packed function system to support golang function closures as packed functions.
 <a href="https://github.com/dmlc/tvm/blob/master/golang/sample">Examples</a> available to register golang
 closure as TVM packed function and invoke the same across programming language barriers.</p>
 
diff --git a/2019/04/29/opt-cuda-quantized.html b/2019/04/29/opt-cuda-quantized.html
index 4ce4917..40c7157 100644
--- a/2019/04/29/opt-cuda-quantized.html
+++ b/2019/04/29/opt-cuda-quantized.html
@@ -165,7 +165,7 @@
     <p>Deep learning has been successfully applied to a variety of tasks.
 On real-time scenarios such as inference on autonomous vehicles, the inference speed of the model is critical.
 Network quantization is an effective approach to accelerating deep learning models.
-In quantized models, both data and model parameters are represented with low precision data types such as <code class="language-plaintext highlighter-rouge">int8</code> and <code class="language-plaintext highlighter-rouge">float16</code>.
+In quantized models, both data and model parameters are represented with low precision data types such as <code class="highlighter-rouge">int8</code> and <code class="highlighter-rouge">float16</code>.
 The lowered data bandwidth reduces the inference time and memory/storage requirements, as well as the power consumption.
 Meanwhile, under proper quantization schemes, we can minimize the accuracy drops of the quantized models.
 Therefore, quantized models are of particular interests of researchers and developers as it makes large models suitable to deploy on diverse devices, such as GPU, CPU and mobile devices.</p>
@@ -187,38 +187,38 @@ In emerging models such as ResNeXt and Deformable ConvNets, the automatic optimi
 
 <h1 id="expressing-quantized-cuda-kernels-in-tvm">Expressing Quantized CUDA Kernels in TVM</h1>
 <h2 id="leveraging-tensor-intrinsics-via-tensorization">Leveraging Tensor Intrinsics via Tensorization</h2>
-<p>Many platforms provide architecture-specific instructions for special computation patterns, for example, the SIMD instructions on x86, and the <code class="language-plaintext highlighter-rouge">dp4a</code> and <code class="language-plaintext highlighter-rouge">hfma</code> instructions on CUDA.
+<p>Many platforms provide architecture-specific instructions for special computation patterns, for example, the SIMD instructions on x86, and the <code class="highlighter-rouge">dp4a</code> and <code class="highlighter-rouge">hfma</code> instructions on CUDA.
 These intrinsic instructions are highly optimized for specific devices.
 By leveraging hardware intrinsics, we can achieve a significant performance boost for quantized operators.</p>
 
 <p>Currently, <a href="https://devblogs.nvidia.com/mixed-precision-programming-cuda-8/">dp4a</a> has been extensively used in TVM int8 operators on CUDA.
-<code class="language-plaintext highlighter-rouge">dp4a</code> is a CUDA intrinsic on Compute Capability 6.1 devices.
+<code class="highlighter-rouge">dp4a</code> is a CUDA intrinsic on Compute Capability 6.1 devices.
 It is a mixed-precision instruction that provides the efficient computation of the dot product between two 4-element 8-bit integer vectors and accumulates the result in 32-bit format.
-Using <code class="language-plaintext highlighter-rouge">dp4a</code>, we can implement a dot product between 8-bit integer vectors with number of elements evenly divisible by four.
+Using <code class="highlighter-rouge">dp4a</code>, we can implement a dot product between 8-bit integer vectors with number of elements evenly divisible by four.
 With an efficient dot product operator, we can implement high-level operators such as 2d convolution and dense layers as these operators are commonly backed by dot products.</p>
 
 <p>To illustrate, in 2d convolution we accumulate along the channel, the width, and the height axis of the kernel.
-This is a typical use case of <code class="language-plaintext highlighter-rouge">dp4a</code>.
+This is a typical use case of <code class="highlighter-rouge">dp4a</code>.
 TVM uses tensorization to support calling external intrinsics.
-We do not need to modify the original computation declaration; we use the schedule primitive <code class="language-plaintext highlighter-rouge">tensorize</code> to replace the accumulation with <code class="language-plaintext highlighter-rouge">dp4a</code> tensor intrinsic.
+We do not need to modify the original computation declaration; we use the schedule primitive <code class="highlighter-rouge">tensorize</code> to replace the accumulation with <code class="highlighter-rouge">dp4a</code> tensor intrinsic.
 More details of tensorization can be found in the <a href="https://tvm.apache.org/docs//tutorials/language/tensorize.html">tutorial</a>.</p>
 
 <h2 id="data-layout-rearrangement">Data Layout Rearrangement</h2>
 <p>One of the challenges in tensorization is that we may need to design special computation logic to adapt to the requirement of tensor intrinsics.
-Although it is natural to accumulate along the inner axis of the tensor in the dense operator, <code class="language-plaintext highlighter-rouge">conv2d</code> can be more challenging.
-In <code class="language-plaintext highlighter-rouge">conv2d</code> we expect to take a slice in the channel dimension as the input of <code class="language-plaintext highlighter-rouge">dp4a</code> because the number of channels is typically multiple of 4 (otherwise we fall back to original <code class="language-plaintext highlighter-rouge">conv2d</code> in NCHW layout).
+Although it is natural to accumulate along the inner axis of the tensor in the dense operator, <code class="highlighter-rouge">conv2d</code> can be more challenging.
+In <code class="highlighter-rouge">conv2d</code> we expect to take a slice in the channel dimension as the input of <code class="highlighter-rouge">dp4a</code> because the number of channels is typically multiple of 4 (otherwise we fall back to original <code class="highlighter-rouge">conv2d</code> in NCHW layout).
 Meanwhile, to achieve memory locality, we would like to reduce along the innermost axis first.
 Taking these factors into account, we use a custom data layout to address this challenge.</p>
 
-<p>In CUDA int8 2d convolution, we empirically choose <code class="language-plaintext highlighter-rouge">NCHW4c</code> as data layout and <code class="language-plaintext highlighter-rouge">OIHW4o4i</code> as weight layout.
-The templates can also be easily generalized to <code class="language-plaintext highlighter-rouge">NCHW[x]c</code> and <code class="language-plaintext highlighter-rouge">OIHW[x]o[x]i</code>, where x is an arbitrary positive integer divisible by four.
+<p>In CUDA int8 2d convolution, we empirically choose <code class="highlighter-rouge">NCHW4c</code> as data layout and <code class="highlighter-rouge">OIHW4o4i</code> as weight layout.
+The templates can also be easily generalized to <code class="highlighter-rouge">NCHW[x]c</code> and <code class="highlighter-rouge">OIHW[x]o[x]i</code>, where x is an arbitrary positive integer divisible by four.
 In the data layout we choose, slices of channels are in the packed innermost dimension.
 Likewise, we pack slices in both the input and output channel dimensions of the weight so that the output has a consistent data layout with the input, which prevents redundant layout transformations between layers.</p>
 
 <p>We show the computation of one element of the output of the 2d convolution in Figure 2.
 The element in each position of the super dimension (the outer dimension of the blocked layout which contains packed elements) NCHW and OIHW is the packed input and kernel, respectively.
 Each column of the packed kernel comes from a different filter.
-We calculate the dot product between the packed input and each row in the packed kernel using <code class="language-plaintext highlighter-rouge">dp4a</code>, and accumulate the result to the output tensor.</p>
+We calculate the dot product between the packed input and each row in the packed kernel using <code class="highlighter-rouge">dp4a</code>, and accumulate the result to the output tensor.</p>
 
 <p style="text-align: center"><img src="/images/cuda-quantized/conv2d.png" alt="image" width="60%" /></p>
 <div>
@@ -229,7 +229,7 @@ Figure 2. 2D convolution with data layout in NCHW4c and weight layout in OIHW4o4
 </div>
 <p></p>
 
-<p>After we have specified the layout of convolution layers, other operators such as <code class="language-plaintext highlighter-rouge">add</code> and activations can automatically adapt to the chosen layout during the <a href="https://github.com/dmlc/tvm/blob/master/src/relay/pass/alter_op_layout.cc">AlterOpLayout</a> pass in Relay.
+<p>After we have specified the layout of convolution layers, other operators such as <code class="highlighter-rouge">add</code> and activations can automatically adapt to the chosen layout during the <a href="https://github.com/dmlc/tvm/blob/master/src/relay/pass/alter_op_layout.cc">AlterOpLayout</a> pass in Relay.
 The layout transformation of the weight can be precomputed offline. Therefore, we can run the whole model in the same layout without extra overhead.</p>
 
 <h2 id="designing-search-space-for-automatic-optimization">Designing Search Space for Automatic Optimization</h2>
@@ -241,8 +241,8 @@ For example, as caching data in the shared memory is a common practice in CUDA p
 We also do some manual tiling such as splitting axes by 4 or 16 to facilitate vectorized memory access.</p>
 
 <p>In quantized 2d convolution, we design a search space that includes a set of tunable options, such as the tile size, the axes to fuse, configurations of loop unrolling and double buffering.
-The templates of quantized <code class="language-plaintext highlighter-rouge">conv2d</code> and <code class="language-plaintext highlighter-rouge">dense</code> on CUDA are registered under template key <code class="language-plaintext highlighter-rouge">int8</code>.
-During automatic tuning, we can create tuning tasks for these quantized operators by setting the <code class="language-plaintext highlighter-rouge">template_key</code> argument.
+The templates of quantized <code class="highlighter-rouge">conv2d</code> and <code class="highlighter-rouge">dense</code> on CUDA are registered under template key <code class="highlighter-rouge">int8</code>.
+During automatic tuning, we can create tuning tasks for these quantized operators by setting the <code class="highlighter-rouge">template_key</code> argument.
 Details of how to launch automatic optimization can be found in the <a href="https://tvm.apache.org/docs//tutorials/autotvm/tune_relay_cuda.html">AutoTVM tutorial</a>.</p>
 
 <h1 id="general-workflow">General Workflow</h1>
@@ -253,22 +253,22 @@ Details of how to launch automatic optimization can be found in the <a href="htt
 
 <p>TVM provides an easy workflow to quantize trained models from other frameworks, automatically optimize operators (with AutoTVM), and deploy to different devices.</p>
 
-<p>First, we use the Relay frontend to import existing models. Here we use an MXNet model with <code class="language-plaintext highlighter-rouge">(1, 3, 224, 224)</code> input shape as an example.</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">sym</span><span class="p">,</span> <span class="n">arg_params</span><span class="p">,</span> <span class="n">aux_params</span> <span class="o">=</span> <span class="n">mxnet</span><span class="p">.</span><span class="n">model</span><span class="p">.</span><span class="n">load_checkpoint</span><span class="p">(</span><span class="n">model_path</span><span class="p">,</span> < [...]
-<span class="n">net</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">from_mxnet</span><span class="p">(</span><span class="n">sym</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">{</span><span class="s">'data'</span><span class="p">:</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</sp [...]
+<p>First, we use the Relay frontend to import existing models. Here we use an MXNet model with <code class="highlighter-rouge">(1, 3, 224, 224)</code> input shape as an example.</p>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">sym</span><span class="p">,</span> <span class="n">arg_params</span><span class="p">,</span> <span class="n">aux_params</span> <span class="o">=</span> <span class="n">mxnet</span><span class="o">.</span><span class="n">model</span><span class="o">.</span><span class="n">load_checkpoint</span><span class="p">(</span><span class="n">model_path</span><span class="p">,</span> < [...]
+<span class="n">net</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">from_mxnet</span><span class="p">(</span><span class="n">sym</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">{</span><span class="s">'data'</span><span class="p">:</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</sp [...]
 </code></pre></div></div>
 
 <p>Next, we use the relay quantization API to convert it to a quantized model.</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">net</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">quantize</span><span class="p">.</span><span class="n">quantize</span><span class="p">(</span><span class="n">net</span><span class="p">,</span> <span class="n">params</span><span class="o">=</span><span class="n">params</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">net</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">quantize</span><span class="o">.</span><span class="n">quantize</span><span class="p">(</span><span class="n">net</span><span class="p">,</span> <span class="n">params</span><span class="o">=</span><span class="n">params</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>Then, we use AutoTVM to extract tuning tasks for the operators in the model and perform automatic optimization. The <a href="https://tvm.apache.org/docs//tutorials/autotvm/tune_relay_cuda.html">AutoTVM tutorial</a> provides an example for this.</p>
 
 <p>Finally, we build the model and run inference in the quantized mode.</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">with</span> <span class="n">relay</span><span class="p">.</span><span class="n">build_config</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">3</span><span class="p">):</span>
-    <span class="n">graph</span><span class="p">,</span> <span class="n">lib</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">net</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">with</span> <span class="n">relay</span><span class="o">.</span><span class="n">build_config</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">3</span><span class="p">):</span>
+    <span class="n">graph</span><span class="p">,</span> <span class="n">lib</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">net</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
 </code></pre></div></div>
-<p>The result of <code class="language-plaintext highlighter-rouge">relay.build</code> is a deployable library.
+<p>The result of <code class="highlighter-rouge">relay.build</code> is a deployable library.
 We can either run inference <a href="https://tvm.apache.org/docs//tutorials/frontend/from_mxnet.html#execute-the-portable-graph-on-tvm">on the GPU</a> directly or deploy <a href="https://tvm.apache.org/docs//tutorials/frontend/deploy_model_on_rasp.html#deploy-the-model-remotely-by-rpc">on the remote devices</a> via RPC.</p>
 
 <h1 id="benchmark">Benchmark</h1>
diff --git a/2019/05/30/pytorch-frontend.html b/2019/05/30/pytorch-frontend.html
index cbad9c3..6d3948b 100644
--- a/2019/05/30/pytorch-frontend.html
+++ b/2019/05/30/pytorch-frontend.html
@@ -169,7 +169,7 @@ To that end, PyTorch now has an official TVM-based backend, <a href="https://git
 
 <p>Usage is simple:</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import torch_tvm
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import torch_tvm
 torch_tvm.enable()
 </code></pre></div></div>
 
@@ -187,11 +187,11 @@ torch_tvm.enable()
 
 <p>To support Relay, two features were added to the PyTorch JIT: custom transformation passes and custom subgraph interpreters.</p>
 
-<p>When <code class="language-plaintext highlighter-rouge">torch_tvm</code> is enabled, subgraphs of PyTorch IR that can be converted to Relay <code class="language-plaintext highlighter-rouge">Expr</code>s will be marked as Relay-compatible.  Since PyTorch IR does not always contain shape information, none of the subgraphs can be compiled in a useful way before invocation.</p>
+<p>When <code class="highlighter-rouge">torch_tvm</code> is enabled, subgraphs of PyTorch IR that can be converted to Relay <code class="highlighter-rouge">Expr</code>s will be marked as Relay-compatible.  Since PyTorch IR does not always contain shape information, none of the subgraphs can be compiled in a useful way before invocation.</p>
 
 <p>During user invocation, the PyTorch JIT runtime will determine input shape information and compile the previously marked subgraphs with the new Relay C++ <a href="https://github.com/pytorch/tvm/blob/master/torch_tvm/compiler.cpp#L226-L246">build system</a>.  The compilation is cached based on input shapes for subsequent runs.  More details can be found in the <a href="https://github.com/pytorch/tvm/blob/master/README.md">README</a>.</p>
 
-<p><code class="language-plaintext highlighter-rouge">torch_tvm</code> has a continuous benchmark system set up, which is monitoring the performance of ResNet18 on CPU.
+<p><code class="highlighter-rouge">torch_tvm</code> has a continuous benchmark system set up, which is monitoring the performance of ResNet18 on CPU.
 Out of the box TVM provides over two times the performance of the default PyTorch JIT backend for various ResNet models.
 Below is a graph that details the iterations per second achieved with 16 threads on an AWS c5n.4xlarge instance (larger is better):</p>
 
@@ -207,9 +207,9 @@ Below is a graph that details the iterations per second achieved with 16 threads
 
 <h3 id="tutorial">Tutorial</h3>
 
-<p>If you have an already written PyTorch model, the easiest way to get started comes from using <code class="language-plaintext highlighter-rouge">torch.jit.trace</code> as follows</p>
+<p>If you have an already written PyTorch model, the easiest way to get started comes from using <code class="highlighter-rouge">torch.jit.trace</code> as follows</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import torch_tvm
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import torch_tvm
 from your_model import model, inputs
 
 torch_tvm.enable(opt_level=3)
@@ -237,12 +237,12 @@ with torch.no_grad():
     print("Took {}s to run {} iters".format(tvm_time, iters))
 </code></pre></div></div>
 
-<p>Much of this code comes from <a href="https://github.com/pytorch/tvm/blob/master/test/benchmarks.py">benchmarks.py</a>.  Note that tuned parameters for AVX2 LLVM compilation is in the <code class="language-plaintext highlighter-rouge">test/</code> folder of the repo.</p>
+<p>Much of this code comes from <a href="https://github.com/pytorch/tvm/blob/master/test/benchmarks.py">benchmarks.py</a>.  Note that tuned parameters for AVX2 LLVM compilation is in the <code class="highlighter-rouge">test/</code> folder of the repo.</p>
 
 <p>If you are more comfortable using Relay directly, it is possible to simply extract the expression directly from a
 PyTorch function either via (implicit) tracing or TorchScript:</p>
 
-<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>def add(a, b, c):
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>def add(a, b, c):
     return a + b + c
 
 # via tracing
diff --git a/2020/05/20/bring-your-own-datatypes.html b/2020/05/20/bring-your-own-datatypes.html
index fc97fb1..caf053a 100644
--- a/2020/05/20/bring-your-own-datatypes.html
+++ b/2020/05/20/bring-your-own-datatypes.html
@@ -167,7 +167,7 @@
 <h2 id="introduction">Introduction</h2>
 
 <p>When designing accelerators, an important decision is how one will approximately represent real numbers in hardware.
-This problem has had a longstanding, industry-standard solution: the IEEE 754 floating-point standard.<sup id="fnref:ieee" role="doc-noteref"><a href="#fn:ieee" class="footnote">1</a></sup>
+This problem has had a longstanding, industry-standard solution: the IEEE 754 floating-point standard.<sup id="fnref:ieee"><a href="#fn:ieee" class="footnote">1</a></sup>
 Yet,
   when trying to squeeze
   the most out of hardware
@@ -181,13 +181,13 @@ If we know the numerical requirements
   or more power efficient datatype?
 The answer is yes!
 Researchers have already begun experimenting with new datatypes in academic and industrial accelerator designs.
-For example, Google’s Tensor Processing Unit (the TPU) uses the <code class="language-plaintext highlighter-rouge">bfloat</code> type: a single-precision IEEE float which has been truncated to 16 bits.
+For example, Google’s Tensor Processing Unit (the TPU) uses the <code class="highlighter-rouge">bfloat</code> type: a single-precision IEEE float which has been truncated to 16 bits.
 Due to the lax numerical requirements
   of many deep learning workloads,
   this truncation often has no effect
   on model accuracy,
   while instantly cutting the storage cost
-  in half.<sup id="fnref:jouppi2017datacenter" role="doc-noteref"><a href="#fn:jouppi2017datacenter" class="footnote">2</a></sup><sup id="fnref:tensorflowbfloat" role="doc-noteref"><a href="#fn:tensorflowbfloat" class="footnote">3</a></sup></p>
+  in half.<sup id="fnref:jouppi2017datacenter"><a href="#fn:jouppi2017datacenter" class="footnote">2</a></sup><sup id="fnref:tensorflowbfloat"><a href="#fn:tensorflowbfloat" class="footnote">3</a></sup></p>
 
 <p>Before researchers begin building hardware for their datatype, however, they first need to determine how their datatype will behave numerically in the workloads they care about.
 This often involves first building a software-emulated version of their datatype
@@ -217,8 +217,8 @@ Unlike the posits-in-Tensorflow example above, which enables a single new dataty
   using custom datatypes.
 In the Bring Your Own Datatypes framework,
   “datatype” means a scalar type:
-  <code class="language-plaintext highlighter-rouge">float32</code>
-  or <code class="language-plaintext highlighter-rouge">uint8</code>, for example.
+  <code class="highlighter-rouge">float32</code>
+  or <code class="highlighter-rouge">uint8</code>, for example.
 We do not handle more complicated data formats
   such as <a href="https://en.wikipedia.org/wiki/Block_floating_point" target="_blank">block floating point</a>
   or Intel’s <a href="https://arxiv.org/abs/1711.02213" target="_blank">Flexpoint</a>.
@@ -234,7 +234,7 @@ Additionally,
 A number of these type codes
   have hard-coded meanings in TVM,
   mapping to common datatypes
-  such as <code class="language-plaintext highlighter-rouge">int</code> and <code class="language-plaintext highlighter-rouge">float</code>.
+  such as <code class="highlighter-rouge">int</code> and <code class="highlighter-rouge">float</code>.
 However,
   the vast majority of type codes
   are unused.
@@ -265,21 +265,21 @@ These steps are akin to
   where the type code comes from
   the range of unused type codes
   available to custom datatypes.</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">tvm</span><span class="p">.</span><span class="n">datatype</span><span class="p">.</span><span class="n">register</span><span class="p">(</span><span class="s">'bfloat'</span><span class="p">,</span> <span class="mi">150</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">tvm</span><span class="o">.</span><span class="n">datatype</span><span class="o">.</span><span class="n">register</span><span class="p">(</span><span class="s">'bfloat'</span><span class="p">,</span> <span class="mi">150</span><span class="p">)</span>
 </code></pre></div></div>
 <p>The above code registers
-  the <code class="language-plaintext highlighter-rouge">'bfloat'</code> datatype
+  the <code class="highlighter-rouge">'bfloat'</code> datatype
   with type code 150.
 This registration step
   allows TVM to parse programs
   which use the custom type:</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">x</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">var</span><span class="p">(</span><span class="s">'x'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span clas [...]
-<span class="n">y</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">var</span><span class="p">(</span><span class="s">'y'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'float32'</span><span class="p">)</span>
-<span class="n">x_bfloat</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">cast</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'custom[bfloat]16'</span><span class="p">)</span>
-<span class="n">y_bfloat</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">cast</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'custom[bfloat]16'</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">x</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s">'x'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span clas [...]
+<span class="n">y</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="s">'y'</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'float32'</span><span class="p">)</span>
+<span class="n">x_bfloat</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'custom[bfloat]16'</span><span class="p">)</span>
+<span class="n">y_bfloat</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'custom[bfloat]16'</span><span class="p">)</span>
 <span class="n">z_bfloat</span> <span class="o">=</span> <span class="n">x_bfloat</span> <span class="o">+</span> <span class="n">y_bfloat</span>
-<span class="n">z</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">cast</span><span class="p">(</span><span class="n">z_bfloat</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'float32'</span><span class="p">)</span>
-<span class="n">program</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">Function</span><span class="p">([</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">],</span> <span class="n">z</span><span class="p">)</span>
+<span class="n">z</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">z_bfloat</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="s">'float32'</span><span class="p">)</span>
+<span class="n">program</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">Function</span><span class="p">([</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">],</span> <span class="n">z</span><span class="p">)</span>
 <span class="k">print</span><span class="p">(</span><span class="n">program</span><span class="p">)</span>
 
 <span class="c1"># v0.0.4
@@ -291,18 +291,18 @@ This registration step
 # }
 </span></code></pre></div></div>
 <p>The program above
-  casts <code class="language-plaintext highlighter-rouge">float32</code> inputs <code class="language-plaintext highlighter-rouge">x</code> and <code class="language-plaintext highlighter-rouge">y</code>
-  into <code class="language-plaintext highlighter-rouge">bfloat</code>s,
+  casts <code class="highlighter-rouge">float32</code> inputs <code class="highlighter-rouge">x</code> and <code class="highlighter-rouge">y</code>
+  into <code class="highlighter-rouge">bfloat</code>s,
   adds them,
-  and casts the result back to <code class="language-plaintext highlighter-rouge">float32</code>.
-Once the <code class="language-plaintext highlighter-rouge">bfloat</code> type is registered,
-  TVM is able to parse the special <code class="language-plaintext highlighter-rouge">dtype</code> syntax
-  <code class="language-plaintext highlighter-rouge">custom[&lt;typename&gt;]</code>,
-  where <code class="language-plaintext highlighter-rouge">&lt;typename&gt;</code> is the name registered for the type.
+  and casts the result back to <code class="highlighter-rouge">float32</code>.
+Once the <code class="highlighter-rouge">bfloat</code> type is registered,
+  TVM is able to parse the special <code class="highlighter-rouge">dtype</code> syntax
+  <code class="highlighter-rouge">custom[&lt;typename&gt;]</code>,
+  where <code class="highlighter-rouge">&lt;typename&gt;</code> is the name registered for the type.
 This syntax also supports the usual
-  <code class="language-plaintext highlighter-rouge">&lt;bits&gt;x&lt;lanes&gt;</code> format;
-  here, we use <code class="language-plaintext highlighter-rouge">16</code> to indicate that
-  each <code class="language-plaintext highlighter-rouge">bfloat</code> is 16 bits wide.
+  <code class="highlighter-rouge">&lt;bits&gt;x&lt;lanes&gt;</code> format;
+  here, we use <code class="highlighter-rouge">16</code> to indicate that
+  each <code class="highlighter-rouge">bfloat</code> is 16 bits wide.
 (The number of lanes
   defaults to 1.)</p>
 
@@ -312,7 +312,7 @@ This syntax also supports the usual
   it cannot yet compile it,
   as TVM does not yet understand 
   how to compile operations 
-  over the <code class="language-plaintext highlighter-rouge">bfloat</code> type.
+  over the <code class="highlighter-rouge">bfloat</code> type.
 To compile these programs,
   we register <em>lowering functions</em> for the custom datatype,
   which help TVM convert the operations
@@ -335,22 +335,22 @@ Figure 1: The expected result of a user's registered lowering function. A loweri
 
 <p>Figure 1 shows a common pattern.
 Let’s assume we are
-  interested in exploring the <code class="language-plaintext highlighter-rouge">bfloat</code> type,
+  interested in exploring the <code class="highlighter-rouge">bfloat</code> type,
   and have chosen to run some workloads
-  by plugging a <code class="language-plaintext highlighter-rouge">bfloat</code> emulation library (e.g. <a href="https://github.com/biovault/biovault_bfloat16" target="_blank">biovault_bfloat16</a>) into TVM
+  by plugging a <code class="highlighter-rouge">bfloat</code> emulation library (e.g. <a href="https://github.com/biovault/biovault_bfloat16" target="_blank">biovault_bfloat16</a>) into TVM
   via the Bring Your Own Datatypes framework.
 Our workload is a simple program
-  which adds two <code class="language-plaintext highlighter-rouge">bfloat</code> inputs.
+  which adds two <code class="highlighter-rouge">bfloat</code> inputs.
 Native TVM does not understand
-  how to implement <code class="language-plaintext highlighter-rouge">bfloat</code> addition—but it doesn’t need to,
+  how to implement <code class="highlighter-rouge">bfloat</code> addition—but it doesn’t need to,
   as we have a library implementing our datatype!
-The library contains an implementation of <code class="language-plaintext highlighter-rouge">bfloat</code> addition,
+The library contains an implementation of <code class="highlighter-rouge">bfloat</code> addition,
   alongside other operators such as multiplication and square root.
-To implement this <code class="language-plaintext highlighter-rouge">bfloat</code> addition,
+To implement this <code class="highlighter-rouge">bfloat</code> addition,
   we’d just like to call into our library.
 Thus, our Add node should become a Call node,
-  calling out to a function (call it <code class="language-plaintext highlighter-rouge">BFloat16Add</code>) in our library.
-To store the bits of the input <code class="language-plaintext highlighter-rouge">bfloat</code>s
+  calling out to a function (call it <code class="highlighter-rouge">BFloat16Add</code>) in our library.
+To store the bits of the input <code class="highlighter-rouge">bfloat</code>s
   inside a type that TVM understands,
   we use 16-bit unsigned integers.
 The resulting program 
@@ -360,16 +360,16 @@ The resulting program
 
 <p>To achieve the above lowering,
   we register a lowering function
-  for <code class="language-plaintext highlighter-rouge">bfloat</code>:</p>
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">tvm</span><span class="p">.</span><span class="n">datatype</span><span class="p">.</span><span class="n">register_op</span><span class="p">(</span>
-    <span class="n">tvm</span><span class="p">.</span><span class="n">datatype</span><span class="p">.</span><span class="n">create_lower_func</span><span class="p">(</span><span class="s">'BFloat16Add'</span><span class="p">),</span>
+  for <code class="highlighter-rouge">bfloat</code>:</p>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">tvm</span><span class="o">.</span><span class="n">datatype</span><span class="o">.</span><span class="n">register_op</span><span class="p">(</span>
+    <span class="n">tvm</span><span class="o">.</span><span class="n">datatype</span><span class="o">.</span><span class="n">create_lower_func</span><span class="p">(</span><span class="s">'BFloat16Add'</span><span class="p">),</span>
     <span class="s">'Add'</span><span class="p">,</span> <span class="s">'llvm'</span><span class="p">,</span> <span class="s">'bfloat'</span><span class="p">)</span>
 </code></pre></div></div>
 <p>The above code registers
   a lowering function
   for a specific operator (Add),
   compilation target (LLVM),
-  and datatype (<code class="language-plaintext highlighter-rouge">bfloat</code>).
+  and datatype (<code class="highlighter-rouge">bfloat</code>).
 The first argument
   is the lowering function.
 This can be any function
@@ -378,15 +378,15 @@ This can be any function
 In our case,
   we use a helper function
   provided by the Bring Your Own Datatypes framework.
-<code class="language-plaintext highlighter-rouge">tvm.datatype.create_lower_func('BFloat16Add')</code>
+<code class="highlighter-rouge">tvm.datatype.create_lower_func('BFloat16Add')</code>
   creates a lowering function
   for the common pattern described above.
 The resulting function
   converts the arguments of the given node
-  to <code class="language-plaintext highlighter-rouge">uint16_t</code>,
+  to <code class="highlighter-rouge">uint16_t</code>,
   and then converts the node itself
   into a call to the given function name
-  (in this case, <code class="language-plaintext highlighter-rouge">'BFloat16Add'</code>).</p>
+  (in this case, <code class="highlighter-rouge">'BFloat16Add'</code>).</p>
 
 <p>To implement a custom datatype,
   the user will need to register
@@ -427,16 +427,16 @@ The Bring Your Own Datatypes framework
 
 <h2 id="references">References</h2>
 
-<div class="footnotes" role="doc-endnotes">
+<div class="footnotes">
   <ol>
-    <li id="fn:ieee" role="doc-endnote">
-      <p><a href="https://standards.ieee.org/standard/754-2019.html" target="_blank">754-2019 - IEEE Standard for Floating-Point Arithmetic</a> <a href="#fnref:ieee" class="reversefootnote" role="doc-backlink">&#8617;</a></p>
+    <li id="fn:ieee">
+      <p><a href="https://standards.ieee.org/standard/754-2019.html" target="_blank">754-2019 - IEEE Standard for Floating-Point Arithmetic</a> <a href="#fnref:ieee" class="reversefootnote">&#8617;</a></p>
     </li>
-    <li id="fn:jouppi2017datacenter" role="doc-endnote">
-      <p>Jouppi, Norman P., et al. “In-datacenter performance analysis of a tensor processing unit.” Proceedings of the 44th Annual International Symposium on Computer Architecture. 2017. <a href="#fnref:jouppi2017datacenter" class="reversefootnote" role="doc-backlink">&#8617;</a></p>
+    <li id="fn:jouppi2017datacenter">
+      <p>Jouppi, Norman P., et al. “In-datacenter performance analysis of a tensor processing unit.” Proceedings of the 44th Annual International Symposium on Computer Architecture. 2017. <a href="#fnref:jouppi2017datacenter" class="reversefootnote">&#8617;</a></p>
     </li>
-    <li id="fn:tensorflowbfloat" role="doc-endnote">
-      <p><a href="https://cloud.google.com/tpu/docs/bfloat16" target="_blank">Using bfloat16 with TensorFlow models</a> <a href="#fnref:tensorflowbfloat" class="reversefootnote" role="doc-backlink">&#8617;</a></p>
+    <li id="fn:tensorflowbfloat">
+      <p><a href="https://cloud.google.com/tpu/docs/bfloat16" target="_blank">Using bfloat16 with TensorFlow models</a> <a href="#fnref:tensorflowbfloat" class="reversefootnote">&#8617;</a></p>
     </li>
   </ol>
 </div>
diff --git a/2020/06/04/tinyml-how-tvm-is-taming-tiny.html b/2020/06/04/tinyml-how-tvm-is-taming-tiny.html
index f322246..09bca33 100644
--- a/2020/06/04/tinyml-how-tvm-is-taming-tiny.html
+++ b/2020/06/04/tinyml-how-tvm-is-taming-tiny.html
@@ -182,32 +182,32 @@ A standard µTVM setup, where the host communicates with the device via JTAG.</p
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">OPENOCD_SERVER_ADDR</span> <span class="o">=</span> <span class="s">'127.0.0.1'</span>
 <span class="n">OPENOCD_SERVER_PORT</span> <span class="o">=</span> <span class="mi">6666</span>
-<span class="n">TARGET</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">target</span><span class="p">.</span><span class="n">create</span><span class="p">(</span><span class="s">'c -device=micro_dev'</span><span class="p">)</span>
-<span class="n">DEV_CONFIG</span> <span class="o">=</span> <span class="n">stm32f746xx</span><span class="p">.</span><span class="n">default_config</span><span class="p">(</span><span class="n">OPENOCD_SERVER_ADDR</span><span class="p">,</span> <span class="n">OPENOCD_SERVER_PORT</span><span class="p">)</span>
+<span class="n">TARGET</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">target</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s">'c -device=micro_dev'</span><span class="p">)</span>
+<span class="n">DEV_CONFIG</span> <span class="o">=</span> <span class="n">stm32f746xx</span><span class="o">.</span><span class="n">default_config</span><span class="p">(</span><span class="n">OPENOCD_SERVER_ADDR</span><span class="p">,</span> <span class="n">OPENOCD_SERVER_PORT</span><span class="p">)</span>
 
 <span class="n">module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">get_cifar10_cnn</span><span class="p">()</span>
-<span class="k">with</span> <span class="n">micro</span><span class="p">.</span><span class="n">Session</span><span class="p">(</span><span class="n">device_config</span><span class="p">)</span> <span class="k">as</span> <span class="n">sess</span><span class="p">:</span>
-	<span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">target</span><span class="o">=</span><span class="n">TARGET</span><span class="p">,</span> <span cl [...]
-  <span class="n">micro_mod</span> <span class="o">=</span> <span class="n">micro</span><span class="p">.</span><span class="n">create_micro_mod</span><span class="p">(</span><span class="n">c_module</span><span class="p">,</span> <span class="n">DEV_CONFIG</span><span class="p">)</span>
-  <span class="n">graph_mod</span> <span class="o">=</span> <span class="n">graph_runtime</span><span class="p">.</span><span class="n">create</span><span class="p">(</span><span class="n">graph</span><span class="p">,</span> <span class="n">micro_mod</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">tvm</span><span class="p">.</span><span class="n">micro_dev</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
-  <span class="n">graph_mod</span><span class="p">.</span><span class="n">run</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">data_np</span><span class="p">)</span>
-  <span class="n">prediction</span> <span class="o">=</span> <span class="n">CIFAR10_CLASSES</span><span class="p">[</span><span class="n">np</span><span class="p">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">graph_mod</span><span class="p">.</span><span class="n">get_output</span><span class="p">(</span><span class="mi">0</span><span class="p">).</span><span class="n">asnumpy</span><span class="p">())]</span>
-  <span class="k">print</span><span class="p">(</span><span class="s">f'prediction was </span><span class="si">{</span><span class="n">prediction</span><span class="si">}</span><span class="s">'</span><span class="p">)</span>
+<span class="k">with</span> <span class="n">micro</span><span class="o">.</span><span class="n">Session</span><span class="p">(</span><span class="n">device_config</span><span class="p">)</span> <span class="k">as</span> <span class="n">sess</span><span class="p">:</span>
+	<span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">target</span><span class="o">=</span><span class="n">TARGET</span><span class="p">,</span> <span cl [...]
+  <span class="n">micro_mod</span> <span class="o">=</span> <span class="n">micro</span><span class="o">.</span><span class="n">create_micro_mod</span><span class="p">(</span><span class="n">c_module</span><span class="p">,</span> <span class="n">DEV_CONFIG</span><span class="p">)</span>
+  <span class="n">graph_mod</span> <span class="o">=</span> <span class="n">graph_runtime</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">graph</span><span class="p">,</span> <span class="n">micro_mod</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">tvm</span><span class="o">.</span><span class="n">micro_dev</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
+  <span class="n">graph_mod</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">data_np</span><span class="p">)</span>
+  <span class="n">prediction</span> <span class="o">=</span> <span class="n">CIFAR10_CLASSES</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">argmax</span><span class="p">(</span><span class="n">graph_mod</span><span class="o">.</span><span class="n">get_output</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">())]</span>
+  <span class="k">print</span><span class="p">(</span><span class="n">f</span><span class="s">'prediction was {prediction}'</span><span class="p">)</span>
 </code></pre></div></div>
 
-<p>Below are the performance results of MicroTVM, compared with <a href="https://github.com/ARM-software/CMSIS_5/releases/tag/5.6.0">CMSIS-NN version 5.7.0</a> (commit <code class="language-plaintext highlighter-rouge">a65b7c9a</code>), a hand-optimized library of ML kernels.</p>
+<p>Below are the performance results of MicroTVM, compared with <a href="https://github.com/ARM-software/CMSIS_5/releases/tag/5.6.0">CMSIS-NN version 5.7.0</a> (commit <code class="highlighter-rouge">a65b7c9a</code>), a hand-optimized library of ML kernels.</p>
 
 <p style="text-align: center"><img src="/images/microtvm/post-2020-05-28/cifar10-int-8-cnn.png" alt="/images/microtvm/post-2020-05-28/cifar10-int-8-cnn.png" width="60%" /><br /></p>
 
 <p>As we can see, the out-of-the-box performance isn’t great, but this is where <a href="https://dl.acm.org/doi/10.5555/3327144.3327258">AutoTVM</a> comes to the rescue.  We can write a schedule template for our device, do a round of autotuning, then achieve significantly better results.  To plug in our autotuned results, we only need to replace this line:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">t [...]
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">t [...]
 </code></pre></div></div>
 
 <p>with these lines:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">with</span> <span class="n">TARGET</span><span class="p">,</span> <span class="n">autotvm</span><span class="p">.</span><span class="n">apply_history_best</span><span class="p">(</span><span class="n">TUNING_RESULTS_FILE</span><span class="p">):</span>
-  <span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">target</span><span class="o">=</span><span class="n">TARGET</span><span class="p">,</span> <span c [...]
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">with</span> <span class="n">TARGET</span><span class="p">,</span> <span class="n">autotvm</span><span class="o">.</span><span class="n">apply_history_best</span><span class="p">(</span><span class="n">TUNING_RESULTS_FILE</span><span class="p">):</span>
+  <span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">target</span><span class="o">=</span><span class="n">TARGET</span><span class="p">,</span> <span c [...]
 </code></pre></div></div>
 
 <p>And our results now look like this:</p>
@@ -240,7 +240,7 @@ The µTVM Device Memory Layout in RAM</p>
 </span>         <span class="s">'text'</span><span class="p">:</span> <span class="mi">18000</span><span class="p">,</span>
          <span class="s">'rodata'</span><span class="p">:</span> <span class="mi">100</span><span class="p">,</span>
          <span class="s">'data'</span><span class="p">:</span> <span class="mi">100</span><span class="p">,</span>
-         <span class="p">...</span>
+         <span class="o">...</span>
     <span class="p">},</span>
     <span class="s">'word_size'</span><span class="p">:</span> <span class="mi">4</span><span class="p">,</span>                        <span class="c1"># device word size
 </span>    <span class="s">'thumb_mode'</span><span class="p">:</span> <span class="bp">True</span><span class="p">,</span>                    <span class="c1"># whether to use ARM's thumb ISA
@@ -276,16 +276,16 @@ The µTVM Device Memory Layout in RAM</p>
 
 <h2 id="device-sessions">Device Sessions</h2>
 
-<p>Given the networked nature of microcontroller interaction, we slightly deviate from standard TVM code by introducing the concept of <code class="language-plaintext highlighter-rouge">MicroSession</code>.</p>
+<p>Given the networked nature of microcontroller interaction, we slightly deviate from standard TVM code by introducing the concept of <code class="highlighter-rouge">MicroSession</code>.</p>
 
 <p>Every piece of functionality in µTVM relies on having an open session with the target device.  If you’re familiar with TVM, you may have noticed a line of code that deviates from the norm in our first code snippet—-namely, this one:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="p">...</span>
-<span class="k">with</span> <span class="n">micro</span><span class="p">.</span><span class="n">Session</span><span class="p">(</span><span class="n">device_config</span><span class="p">)</span> <span class="k">as</span> <span class="n">sess</span><span class="p">:</span>
-	<span class="p">...</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="o">...</span>
+<span class="k">with</span> <span class="n">micro</span><span class="o">.</span><span class="n">Session</span><span class="p">(</span><span class="n">device_config</span><span class="p">)</span> <span class="k">as</span> <span class="n">sess</span><span class="p">:</span>
+	<span class="o">...</span>
 </code></pre></div></div>
 
-<p>Every line inside this <code class="language-plaintext highlighter-rouge">with</code> block can call functions in µTVM, with the context being the device specified by <code class="language-plaintext highlighter-rouge">device_config</code>.  This line is doing a number of things under the hood, so let’s unpack it.</p>
+<p>Every line inside this <code class="highlighter-rouge">with</code> block can call functions in µTVM, with the context being the device specified by <code class="highlighter-rouge">device_config</code>.  This line is doing a number of things under the hood, so let’s unpack it.</p>
 
 <p>First, it initializes a connection with your device, using whichever communication method you specified (usually OpenOCD).  The µTVM device runtime is then cross-compiled, using whichever cross-compiler you specified.  Finally, space for the compiled binary is allocated by the host, and the binary is loaded onto the device using the opened connection.</p>
 
@@ -295,14 +295,14 @@ The µTVM Device Memory Layout in RAM</p>
 
 <p>One of the core abstractions in TVM is that of a module.  A module stores a set of related functions for a particular device/runtime target.  Given that microcontrollers don’t normally have operating systems, µTVM needs to do a lot of extra work to maintain this high-level abstraction.  To see what’s going on, we’ll trace through the process of creating and loading a µTVM-compatible module.</p>
 
-<p>Suppose we have a <code class="language-plaintext highlighter-rouge">micro.Session</code> open with our device and a TVM schedule that implements 2D convolution.  If we want to load it onto our microcontroller, we need it to emit C code.  To do so, we just need to set the <code class="language-plaintext highlighter-rouge">target</code> in either <code class="language-plaintext highlighter-rouge">tvm.build</code> or <code class="language-plaintext highlighter-rouge">relay.build</code>. [...]
+<p>Suppose we have a <code class="highlighter-rouge">micro.Session</code> open with our device and a TVM schedule that implements 2D convolution.  If we want to load it onto our microcontroller, we need it to emit C code.  To do so, we just need to set the <code class="highlighter-rouge">target</code> in either <code class="highlighter-rouge">tvm.build</code> or <code class="highlighter-rouge">relay.build</code>.  Example:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="p">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">t [...]
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">graph</span><span class="p">,</span> <span class="n">c_module</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">module</span><span class="p">[</span><span class="s">'main'</span><span class="p">],</span> <span class="n">t [...]
 </code></pre></div></div>
 
-<p>By setting the target like so, the build process runs through our C code generation backend.  However, the resulting C module still resides on the host machine.  In order to load it onto the device, we run it through one of the core functions in the µTVM infrastructure: <code class="language-plaintext highlighter-rouge">create_micro_mod</code>.  Example:</p>
+<p>By setting the target like so, the build process runs through our C code generation backend.  However, the resulting C module still resides on the host machine.  In order to load it onto the device, we run it through one of the core functions in the µTVM infrastructure: <code class="highlighter-rouge">create_micro_mod</code>.  Example:</p>
 
-<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">micro_mod</span> <span class="o">=</span> <span class="n">micro</span><span class="p">.</span><span class="n">create_micro_mod</span><span class="p">(</span><span class="n">c_module</span><span class="p">,</span> <span class="n">DEV_CONFIG</span><span class="p">)</span>
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">micro_mod</span> <span class="o">=</span> <span class="n">micro</span><span class="o">.</span><span class="n">create_micro_mod</span><span class="p">(</span><span class="n">c_module</span><span class="p">,</span> <span class="n">DEV_CONFIG</span><span class="p">)</span>
 </code></pre></div></div>
 
 <p>The line above cross-compiles the C source within the module, allocates room for the resulting binary (so it can coexist with the runtime in device memory), then sends each section of the binary to its allocated slot on the device.  Once the module binary is snug in device memory, function pointers within the binary are patched to give the module access to helper functions in the device runtime (e.g., for allocating scratchpads).</p>
@@ -317,12 +317,12 @@ The µTVM Device Memory Layout in RAM</p>
 <p>If we want to call an operator, we first need some tensors as arguments:</p>
 
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">data_np</span><span class="p">,</span> <span class="n">kernel_np</span> <span class="o">=</span> <span class="n">get_conv_inputs</span><span class="p">()</span>
-<span class="n">ctx</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">micro_dev</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
-<span class="n">data</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">data_np</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
-<span class="n">kernel</span> <span class="o">=</span> <span class="n">tvm</span><span class="p">.</span><span class="n">nd</span><span class="p">.</span><span class="n">array</span><span class="p">(</span><span class="n">kernel_np</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
+<span class="n">ctx</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">micro_dev</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+<span class="n">data</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">data_np</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
+<span class="n">kernel</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">kernel_np</span><span class="p">,</span> <span class="n">ctx</span><span class="o">=</span><span class="n">ctx</span><span class="p">)</span>
 </code></pre></div></div>
 
-<p>Based on its data type (e.g., <code class="language-plaintext highlighter-rouge">int8</code>, <code class="language-plaintext highlighter-rouge">float32</code>, etc.) and shape, each tensor’s size in bytes is calculated, and the host allocates a region of memory on the device’s heap.  The tensor’s data is then loaded into the allocated region.</p>
+<p>Based on its data type (e.g., <code class="highlighter-rouge">int8</code>, <code class="highlighter-rouge">float32</code>, etc.) and shape, each tensor’s size in bytes is calculated, and the host allocates a region of memory on the device’s heap.  The tensor’s data is then loaded into the allocated region.</p>
 
 <h2 id="function-calls">Function Calls</h2>
 
@@ -352,13 +352,13 @@ The µTVM Device Memory Layout in RAM</p>
 <span class="p">}</span> <span class="n">UTVMTask</span><span class="p">;</span>
 </code></pre></div></div>
 
-<p>In the strict setting, there is a single global <code class="language-plaintext highlighter-rouge">UTVMTask</code> instance that we, from the host side, write into.  Once we have written to the task, the runtime has everything it needs to execute the function, and we can begin execution at the runtime’s entry point.  The runtime will perform some lightweight initialization, run our operator, then return control to the host.</p>
+<p>In the strict setting, there is a single global <code class="highlighter-rouge">UTVMTask</code> instance that we, from the host side, write into.  Once we have written to the task, the runtime has everything it needs to execute the function, and we can begin execution at the runtime’s entry point.  The runtime will perform some lightweight initialization, run our operator, then return control to the host.</p>
 
 <h3 id="lazy-execution">Lazy Execution</h3>
 
 <p>In practice, executing operators as soon as the user requests to becomes prohibitively expensive, as communication overhead begins to dominate.  We can improve the throughput of our system by delaying evaluation until the user wants the results of the call.</p>
 
-<p>From an implementation standpoint, instead of eagerly serializing argument metadata and <code class="language-plaintext highlighter-rouge">UTVMTask</code> data, we now need to accumulate function call metadata on the host side, before flushing it to the device.  The device runtime also needs a few changes: (1) we must now have a global array of <code class="language-plaintext highlighter-rouge">UTVMTask</code> and (2) we need to loop through and execute each task in order.</p>
+<p>From an implementation standpoint, instead of eagerly serializing argument metadata and <code class="highlighter-rouge">UTVMTask</code> data, we now need to accumulate function call metadata on the host side, before flushing it to the device.  The device runtime also needs a few changes: (1) we must now have a global array of <code class="highlighter-rouge">UTVMTask</code> and (2) we need to loop through and execute each task in order.</p>
 
 <h2 id="autotvm-with-microtvm">AutoTVM with MicroTVM</h2>
 
@@ -397,7 +397,7 @@ Diagram of CIFAR-10 CNN</p>
 
 <h2 id="methodology">Methodology</h2>
 
-<p>In our experiments, we use TVM from HEAD (commit <code class="language-plaintext highlighter-rouge">9fa8341</code>), version 5.7.0 of CMSIS-NN (commit <code class="language-plaintext highlighter-rouge">a65b7c9a</code>), version 1.16.0 of STM32CubeF7, and GCC from Arm’s GNU Tools for Arm Embedded Processors 9-2019-q4-major 9.2.1 toolchain (revision 277599).  The host machine used in our experiments runs Ubuntu Linux 18.04.4 LTS and sports an AMD Ryzen Threadripper 2990WX 32-Core Proces [...]
+<p>In our experiments, we use TVM from HEAD (commit <code class="highlighter-rouge">9fa8341</code>), version 5.7.0 of CMSIS-NN (commit <code class="highlighter-rouge">a65b7c9a</code>), version 1.16.0 of STM32CubeF7, and GCC from Arm’s GNU Tools for Arm Embedded Processors 9-2019-q4-major 9.2.1 toolchain (revision 277599).  The host machine used in our experiments runs Ubuntu Linux 18.04.4 LTS and sports an AMD Ryzen Threadripper 2990WX 32-Core Processor with 62GB of RAM.  All evaluation  [...]
 
 <h3 id="arm-specific-optimizations">Arm-Specific Optimizations</h3>
 
@@ -415,7 +415,7 @@ Diagram from CMSIS-NN paper showing a 2x2 matrix multiplication microkernel</p>
 <p>There are certainly other optimizations we could pull from CMSIS-NN to close the gap even further:</p>
 
 <ul>
-  <li>Batch expansion of <code class="language-plaintext highlighter-rouge">int8</code> weights into <code class="language-plaintext highlighter-rouge">int16</code>, to cut down on duplicate expansion for SIMD</li>
+  <li>Batch expansion of <code class="highlighter-rouge">int8</code> weights into <code class="highlighter-rouge">int16</code>, to cut down on duplicate expansion for SIMD</li>
   <li>Splitting convolution into 3x3 tiles to reduce padding checks</li>
 </ul>
 
@@ -430,10 +430,10 @@ Diagram from CMSIS-NN paper showing a 2x2 matrix multiplication microkernel</p>
 <p><a href="https://github.com/areusch/microtvm-blogpost-eval">https://github.com/areusch/microtvm-blogpost-eval</a></p>
 
 <p style="text-align: center"><img src="/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn.png" alt="/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn.png" width="60%" /><br />
-<code class="language-plaintext highlighter-rouge">int8</code>-quantized CIFAR-10 CNN comparison on an Arm STM32F746NG (re-posted from above)</p>
+<code class="highlighter-rouge">int8</code>-quantized CIFAR-10 CNN comparison on an Arm STM32F746NG (re-posted from above)</p>
 
 <p style="text-align: center"><img src="/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn-x86.png" alt="/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn-x86.png" width="60%" /><br />
-<code class="language-plaintext highlighter-rouge">int8</code>-quantized CIFAR-10 CNN comparison on µTVM’s emulated host device</p>
+<code class="highlighter-rouge">int8</code>-quantized CIFAR-10 CNN comparison on µTVM’s emulated host device</p>
 
 <p>On the Arm STM32-series board, we were able to improve performance by ~2x compared to the initial untuned operators, and we achieved results much closer to CMSIS-NN.  Additionally, we were able to significantly improve performance on the host emulated device.  Though the x86 <strong><em>numbers</em></strong> don’t mean much, they show we can use the same infrastructure (µTVM) to optimize performance on vastly different architectures.</p>
 
diff --git a/2020/07/14/bert-pytorch-tvm.html b/2020/07/14/bert-pytorch-tvm.html
new file mode 100644
index 0000000..be00c9e
--- /dev/null
+++ b/2020/07/14/bert-pytorch-tvm.html
@@ -0,0 +1,719 @@
+
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Bridging PyTorch and TVM</title>
+    
+    <meta name="author" content="">
+
+    <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
+    <!--[if lt IE 9]>
+      <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
+    <![endif]-->
+
+    <!-- Le styles -->
+    <link href="/assets/themes/custom-twitter/css/1.4.0/bootstrap.css" rel="stylesheet">
+    <link href="/assets/themes/custom-twitter/css/style.css?body=1" rel="stylesheet" type="text/css" media="all">
+
+    <!-- Le fav and touch icons -->
+  <!-- Update these with your own images
+    <link rel="shortcut icon" href="images/logo/tvm-logo.png">
+  <link rel="shortcut icon" href="images/logo/tvm-logo.png">
+  -->
+  <link href="/images/logo/tvm-logo-square.png" rel="icon" type="image/png"/>
+  <!-- Global site tag (gtag.js) - Google Analytics -->
+  <script async src="https://www.googletagmanager.com/gtag/js?id=UA-75982049-2"></script>
+  <script>
+    window.dataLayer = window.dataLayer || [];
+    function gtag(){dataLayer.push(arguments);}
+
+    gtag('js', new Date());
+    gtag('config', 'UA-75982049-2');
+  </script>
+
+</head>
+
+  <body>
+    <div class="topbar">
+      <div class="fill">
+        <div class="container">
+          <h2 id="logo-wrap">
+            <a href="/" class="nav">
+              <img src="/images/logo/tvm-logo-small-black.png" width="100px">
+            </a>
+          </h2>
+          <ul class="nav" id="nav-bar">
+            
+            
+            
+
+
+
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="/community">Community</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="/download">Download</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="/about">About</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      
+    
+  
+    
+      
+      	
+      	<li><a href="/vta">VTA</a></li>
+      	
+      
+      
+    
+  
+    
+      
+      
+      	
+      	<li><a href="/blog">Blog</a></li>
+      	
+      
+    
+  
+
+
+
+
+            <li> <a href="https://tvm.apache.org/docs">Docs</a></li>
+            <li> <a href="https://tvmconf.org">TVM Conference</a></li>
+            <li> <a href="https://github.com/apache/incubator-tvm/">Github</a></li>
+            <li> <a href="/asf">ASF</a></li>
+          </ul>
+        </div>
+      </div>
+    </div>
+    
+<div class="container">
+<div class="content">
+  <div class="row">
+    <div class="span14">
+      <h1>Bridging PyTorch and TVM </h1>
+      <p class="post-meta">
+        <time datetime="2020-07-14T00:00:00-07:00" itemprop="datePublished">
+          Jul 14, 2020
+        </time>
+        
+        • <span itemprop="author" itemscope itemtype="http://schema.org/Person">
+          <span itemprop="name">Thomas Viehmann, MathInf GmbH</span>
+        </span>
+        
+      </p>
+      <p class="post-meta">
+        </p>
+    </br>
+    
+<p>(A more code-heavy variant is crossposted on the more PyTorch affine <a href="https://lernapparat.de/transformers-pytorch-tvm/">Lernapparat</a>,
+ the Jupyter Notebook to follow along is on <a href="https://github.com/t-vi/pytorch-tvmisc/tree/master/transformers-pytorch-tvm/">github</a>.)</p>
+
+<p>Some of the most intriguing applications of Artificial Intelligence have been in Natural Language Processing.
+Models like BERT or GPT-2 and their variants can seemingly grasp enough of a text to continue it in a way that needs a second look to recognize as gibberish.</p>
+
+<p>These models belong to a class of neural network architectures called <em>Transformers</em>. One of the favourite libraries
+implementing them is the <a href="https://github.com/huggingface/transformers/">HuggingFace transformers library</a>.</p>
+
+<p>But, in contrast to convolutional models or LSTMs where we have heavily optimized implementations, this is not as much the case for transformers.
+So here we explore how TVM can fill the gap. We will do so in two steps:</p>
+
+<ul>
+  <li>First we look at BERT inference and tuning that on TVM.</li>
+  <li>Secondly, we start some more fundamental exploration of how one could use TVM for training in PyTorch.
+Given the experimental nature, we focus on feasibility more than on the performance in this part.</li>
+</ul>
+
+<h1 id="optimizing-bert-inference-with-tvm">Optimizing BERT Inference with TVM</h1>
+
+<p>So how do we get BERT from the transformer library to TVM?</p>
+
+<p>Helpfully, transformers supports tracing their model with the PyTorch JIT. We use their <a href="https://huggingface.co/transformers/torchscript.html">tutorial on it</a>,
+specifically the part until we have a traced model.</p>
+
+<p>The PyTorch traced model takes around 0.65-0.7 seconds for 100 runs on my AMD Radeon VII with the example inputs, which means 6.5-7ms per run.
+We can try to see if we can use TVM to get faster. Converting our model to TVM is a breeze:</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">shape_list</span> <span class="o">=</span> <span class="p">[(</span><span class="n">i</span><span class="o">.</span><span class="n">debugName</span><span class="p">()</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">'.'</span><span class="p">)[</span><span class="mi">0</span><span class="p">],</span> <span class="n">i</span><s [...]
+
+<span class="n">mod_bert</span><span class="p">,</span> <span class="n">params_bert</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">pytorch</span><span class="o">.</span><span class="n">from_pytorch</span><span class="p">(</span><span class="n">traced_model</span><span class="p">,</span>
+                        <span class="n">shape_list</span><span class="p">,</span> <span class="n">default_dtype</span><span class="o">=</span><span class="s">"float32"</span><span class="p">)</span>
+</code></pre></div></div>
+
+<p>There will be a few warnings about not finding dtype information, but it goes well!
+We can now build and run it. Building follows the standard TVM recipe. We also convert the PyTorch (cpu) tensors to TVM arrays.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">target</span> <span class="o">=</span> <span class="s">'rocm -model=gfx906'</span>  <span class="c1"># use what matches your GPU
+</span>
+<span class="n">target_host</span> <span class="o">=</span> <span class="s">'llvm'</span>
+<span class="n">ctx</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">context</span><span class="p">(</span><span class="n">target</span><span class="p">)</span>
+
+<span class="n">tt_a</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">tokens_tensor</span><span class="o">.</span><span class="n">numpy</span><span class="p">(),</span> <span class="n">ctx</span><span class="p">)</span>
+<span class="n">st_a</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">segments_tensors</span><span class="o">.</span><span class="n">numpy</span><span class="p">(),</span> <span class="n">ctx</span><span class="p">)</span>
+</code></pre></div></div>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">backend</span><span class="o">.</span><span class="n">compile_engine</span><span class="o">.</span><span class="n">get</span><span class="p">()</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span> <span class="c1"># just to be sure, see https:/ [...]
+</span>
+<span class="k">with</span> <span class="n">tvm</span><span class="o">.</span><span class="n">transform</span><span class="o">.</span><span class="n">PassContext</span><span class="p">(</span><span class="n">opt_level</span><span class="o">=</span><span class="mi">3</span><span class="p">):</span>
+        <span class="n">graph</span><span class="p">,</span> <span class="n">lib</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">build</span><span class="p">(</span><span class="n">mod_bert</span><span class="p">,</span>
+                                     <span class="n">target</span><span class="o">=</span><span class="n">target</span><span class="p">,</span>
+                                     <span class="n">target_host</span><span class="o">=</span><span class="n">target_host</span><span class="p">,</span>
+                                     <span class="n">params</span><span class="o">=</span><span class="n">params_bert</span><span class="p">)</span>
+<span class="n">module</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">contrib</span><span class="o">.</span><span class="n">graph_runtime</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">graph</span><span class="p">,</span> <span class="n">lib</span><span class="p">,</span> <span class="n">ctx</span><span class="p">)</span>
+</code></pre></div></div>
+
+<p>This will warn us a few times:</p>
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    WARNING:autotvm:Cannot find config for ... batch_matmul.cuda .... A fallback configuration is used, which may bring great performance regression.
+</code></pre></div></div>
+
+<p>Uh oh, <em>may bring great performance regression</em>. Let us see.</p>
+
+<p>But first we run the model and see if the outputs match:</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="p">(</span><span class="mf">8.583069e-06</span><span class="p">,</span> <span class="mf">8.493662e-07</span><span class="p">)</span>
+</code></pre></div></div>
+
+<p>Looks good. Remember that we’re computing in float32, so $10^{-6}$ish is a good result.</p>
+
+<p>After building our model and setting the parameters, we time our model like this:</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">def</span> <span class="nf">x</span><span class="p">():</span>
+    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">):</span>
+        <span class="n">module</span><span class="o">.</span><span class="n">run</span><span class="p">()</span>
+    <span class="n">ctx</span><span class="o">.</span><span class="n">sync</span><span class="p">()</span>
+<span class="n">x</span><span class="p">()</span>
+<span class="o">%</span><span class="n">timeit</span> <span class="n">x</span><span class="p">()</span>
+</code></pre></div></div>
+
+<p>Ouch, it takes 6.65s per 100 runs, or 67ms per run of the model. That’s slow indeed. But the warning said that it was because it could not find (tuned) configurations. Let us then tune the tasks.</p>
+
+<p>Tuning does take half a day or so (I’m basically following the TVM tuning tutorial for ResNet tuning with autotvm.)</p>
+
+<p>After this, we can again build the model, this time with the new configuration. This time we should see no comments about missing configurations.
+Now it’s in the region of 6.5-7ms per run, similar to PyTorch. This is what we get from this very elementary optimization of our operators. We can push it a little further, though.</p>
+
+<p>To see how, let us dive deep into BERT modeling and TVM.</p>
+
+<p>If you don’t want to get the full details, do skip the next section and scroll down to <em>Results</em>. I should add that I would hope that this tuning part of the tutorial will obsolete itself in the sense that in some near future, you will get much better speed right out of the box or at least after some initial tuning. So if you don’t see a speedup between here and <em>Results</em>, that’s because I did my homework in submitting patches.</p>
+
+<h2 id="the-bert-model">The BERT model</h2>
+
+<p>Let us take a closer look at what’s going on in BERT.</p>
+
+<p>Like many deep learning models, BERT comes with a bit of prologue (vocabulary embeddings) and epilogue (pooling) and the bulk is organized into similar-looking blocks, here we have 12 <code class="highlighter-rouge">BertLayer</code> modules.
+The <code class="highlighter-rouge">attention_mask</code> is just to prevent BERT from looking at the answer when dealing with the question.</p>
+
+<p><img src="/images/bert-pytorch/bert_model.svg" alt="Bert Model" /></p>
+
+<p>So let us zoom in and look at a BertLayer in detail, since that ultimately is what we need to make fast.
+As we see in the net diagram, the main part of the <code class="highlighter-rouge">BertLayer</code> module is a submodule <code class="highlighter-rouge">BertSelfAttention</code>.</p>
+
+<p><img src="/images/bert-pytorch/bert_layer.svg" alt="BertLayer" /></p>
+
+<p>Now the <code class="highlighter-rouge">BertSelfAttention</code> captures the famed self-attention mechanism that is the hallmark of transformer models. (I cannot recommend Sascha Rush’s <a href="http://nlp.seas.harvard.edu/2018/04/03/attention.html">Annotated Transformer</a> enough as a detailed walkthrough.)</p>
+
+<h2 id="putting-the-bertlayer-under-the-microscope">Putting the BertLayer under the Microscope</h2>
+
+<p>If we want to go into details, we will want to run a BertLayer individually.
+We grab the inputs of a BertLayer (see the Notebook for how) and convert a single <code class="highlighter-rouge">BertLayer</code> to TVM as we did for the entire model.</p>
+
+<p>To look at the TVM module, we define a little visualization helper (loosely based on TVM <a href="https://github.com/apache/incubator-tvm/pull/4370">PR#4370</a>).</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">import</span> <span class="nn">graphviz</span>
+<span class="k">def</span> <span class="nf">visualize</span><span class="p">(</span><span class="n">expr</span><span class="p">,</span> <span class="n">collapse_small</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">node_attr_dict</span> <span class="o">=</span> <span class="p">{}):</span>
+    <span class="k">def</span> <span class="nf">collect_ops</span><span class="p">(</span><span class="n">node</span><span class="p">):</span>
+        <span class="n">ops</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
+        <span class="k">def</span> <span class="nf">visitor</span><span class="p">(</span><span class="n">e</span><span class="p">):</span>
+            <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">e</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">ir</span><span class="o">.</span><span class="n">Op</span><span class="p">):</span>
+                <span class="n">ops</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">e</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+        <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">analysis</span><span class="o">.</span><span class="n">post_order_visit</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">visitor</span><span class="p">)</span>
+        <span class="k">return</span> <span class="n">ops</span>
+
+    <span class="c1"># node_dict maps a Relay node to an index (node ID)
+</span>    <span class="k">def</span> <span class="nf">_traverse_expr</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">node_dict</span><span class="p">):</span>
+        <span class="k">if</span> <span class="n">node</span> <span class="ow">in</span> <span class="n">node_dict</span><span class="p">:</span>
+            <span class="k">return</span>
+        <span class="n">node_dict</span><span class="p">[</span><span class="n">node</span><span class="p">]</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">node_dict</span><span class="p">)</span>
+
+    <span class="n">node_dict</span> <span class="o">=</span> <span class="p">{}</span>
+    <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">analysis</span><span class="o">.</span><span class="n">post_order_visit</span><span class="p">(</span><span class="n">expr</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">_traverse_expr</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">node_dict</ [...]
+
+    <span class="n">relayviz_nodes</span> <span class="o">=</span> <span class="p">[]</span>
+
+    <span class="n">dot</span> <span class="o">=</span> <span class="n">graphviz</span><span class="o">.</span><span class="n">Digraph</span><span class="p">(</span><span class="nb">format</span><span class="o">=</span><span class="s">'svg'</span><span class="p">,</span> <span class="p">)</span>
+    <span class="n">dot</span><span class="o">.</span><span class="n">attr</span><span class="p">(</span><span class="s">'node'</span><span class="p">,</span> <span class="n">shape</span> <span class="o">=</span> <span class="s">'box'</span><span class="p">)</span>
+
+    <span class="k">def</span> <span class="nf">to_str</span><span class="p">(</span><span class="n">node</span><span class="p">):</span>
+        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Constant</span><span class="p">):</span>
+            <span class="k">return</span> <span class="nb">repr</span><span class="p">(</span><span class="n">node</span><span class="p">)</span><span class="o">.</span><span class="n">lstrip</span><span class="p">(</span><span class="s">'Constant('</span><span class="p">)[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="nb">NotImplementedError</span><span class="p">(</span><span class="s">"to_str:"</span> <span class="o">+</span> <span class="nb">repr</span><span class="p">(</span><span class="n">node</span><span class="p">))</span>
+
+    <span class="k">def</span> <span class="nf">is_small_const</span><span class="p">(</span><span class="n">c</span><span class="p">):</span>
+        <span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="n">collapse_small</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Constant</span><span class="p">)):</span>
+            <span class="k">return</span> <span class="bp">False</span>
+        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">runtime</span><span class="o">.</span><span class="n">ndarray</span><span class="o">.</span><span class="n">NDArray</span><span class="p">):</span>
+            <span class="k">return</span> <span class="n">numpy</span><span class="o">.</span><span class="n">prod</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mi">10</span>
+        <span class="k">return</span> <span class="bp">True</span>
+            
+    <span class="c1"># Sort by node ID
+</span>    <span class="k">for</span> <span class="n">node</span><span class="p">,</span> <span class="n">node_id</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">node_dict</span><span class="o">.</span><span class="n">items</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><s [...]
+        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Function</span><span class="p">):</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="s">'Function'</span><span class="p">,</span> <span class="o">**</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">node</span><span class="o">.</span><span class="n">body</span><span class="p">]),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">))</span>
+        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Var</span><span class="p">):</span>
+            <span class="k">if</span> <span class="n">node</span><span class="o">.</span><span class="n">type_annotation</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">None</span><span class="p">:</span>
+                <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">type_annotation</span><span class="p">,</span> <span class="s">'shape'</span><span class="p">):</span>
+                    <span class="n">shape</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">int</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">node</span><span class="o">.</span><span class="n">type_annotation</span><span class="o">.</span><span class="n">shape</span><span class="p">])</span>
+                    <span class="n">dtype</span> <span class="o">=</span> <span class="n">node</span><span class="o">.</span><span class="n">type_annotation</span><span class="o">.</span><span class="n">dtype</span>
+                    <span class="n">typstr</span> <span class="o">=</span> <span class="s">'Tensor[{}, {}]'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">shape</span><span class="p">,</span> <span class="n">dtype</span><span class="p">)</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">typstr</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">type_annotation</span><span class="p">)</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="n">typstr</span> <span class="o">=</span> <span class="s">'?'</span>
+            <span class="n">d</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="n">shape</span> <span class="o">=</span> <span class="s">'ellipse'</span><span class="p">)</span>
+            <span class="n">d</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span>
+                     <span class="s">'{}: {}'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span>
+                         <span class="n">node</span><span class="o">.</span><span class="n">name_hint</span><span class="p">,</span> <span class="n">typstr</span>
+                     <span class="p">),</span> <span class="o">**</span><span class="n">d</span><span class="p">)</span>
+        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Tuple</span><span class="p">):</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="s">'Tuple[...])'</span><span class="p">,</span> <span class="o">**</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
+            <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">node</span><span class="o">.</span><span class="n">fields</span><span class="p">:</span>
+                <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">field</span><span class="p">]),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">))</span>
+        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Constant</span><span class="p">):</span>
+            
+            <span class="k">if</span> <span class="ow">not</span> <span class="n">is_small_const</span><span class="p">(</span><span class="n">node</span><span class="p">):</span> <span class="c1"># small consts are shown in ops
+</span>                <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="s">'Constant({}, {})'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class= [...]
+                        <span class="o">**</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
+        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Call</span><span class="p">):</span>
+            <span class="n">args_with_edge</span> <span class="o">=</span> <span class="p">[]</span>
+            <span class="n">arg_str_list</span> <span class="o">=</span> <span class="p">[]</span>
+            <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">node</span><span class="o">.</span><span class="n">args</span><span class="p">:</span>
+                <span class="k">if</span> <span class="n">is_small_const</span><span class="p">(</span><span class="n">arg</span><span class="p">):</span>
+                    <span class="n">arg_str_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">to_str</span><span class="p">(</span><span class="n">arg</span><span class="p">))</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">arg_str_list</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s">'·'</span><span class="p">)</span>
+                    <span class="n">args_with_edge</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span>
+            <span class="n">arg_str</span> <span class="o">=</span> <span class="s">', '</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">arg_str_list</span><span class="p">)</span>
+            <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">op</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">ir</span><span class="o">.</span><span class="n">Op</span><span class="p">):</span>
+                <span class="n">name</span> <span class="o">=</span> <span class="n">node</span><span class="o">.</span><span class="n">op</span><span class="o">.</span><span class="n">name</span>
+                <span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span><span class="nb">getattr</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">attrs</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">node</span><span class="o">.</span><span class= [...]
+                <span class="c1">#attrs = inspect.getmembers(node.attrs)
+</span>                <span class="n">attr_str_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">k</span><span class="o">+</span><span class="s">'='</span><span class="o">+</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">v</span><span cla [...]
+                <span class="k">if</span> <span class="n">attr_str_list</span><span class="p">:</span>
+                    <span class="n">attr_str</span> <span class="o">=</span> <span class="s">'| '</span><span class="o">+</span> <span class="s">', '</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">attr_str_list</span><span class="p">)</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">attr_str</span> <span class="o">=</span> <span class="s">''</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="n">ops</span> <span class="o">=</span> <span class="n">collect_ops</span><span class="p">(</span><span class="n">node</span><span class="p">)</span>
+                <span class="k">if</span> <span class="n">ops</span><span class="p">:</span>
+                    <span class="n">name</span> <span class="o">=</span> <span class="s">'_'</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ops</span><span class="p">)</span>
+                <span class="k">else</span><span class="p">:</span>
+                    <span class="n">name</span> <span class="o">=</span> <span class="s">'...'</span>
+                <span class="n">attr_str</span> <span class="o">=</span> <span class="s">''</span>
+            <span class="n">s</span> <span class="o">=</span> <span class="n">f</span><span class="s">'{name}({arg_str}{attr_str})'</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="n">s</span><span class="p">,</span> <span class="o">**</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
+            <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args_with_edge</span><span class="p">:</span>
+                <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">arg</span><span class="p">]),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">))</span>
+        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">ir</span><span class="o">.</span><span class="n">Op</span><span class="p">):</span>
+            <span class="c1"># dot.node(str(node_id), 'Op {}'.format(node.name))
+</span>            <span class="k">pass</span> <span class="c1"># covered in call
+</span>        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">TupleGetItem</span><span class="p">):</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="s">'TupleGetItem(idx={})'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">index</span><span class="p">),</span> <span class="o">**</span><span class="n">nod [...]
+            <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">node</span><span class="o">.</span><span class="n">tuple_value</span><span class="p">]),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">))</span>
+        <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Let</span><span class="p">):</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="s">'Let(XX)'</span><span class="p">,</span> <span class="o">**</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">node</span><span class="o">.</span><span class="n">value</span><span class="p">]),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">))</span>
+            <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">node</span><span class="o">.</span><span class="n">var</span><span class="p">]))</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="nb">RuntimeError</span><span class="p">(</span>
+                <span class="s">'Unknown node type. node_id: {}, node: {}'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">node_id</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">node</span><span class="p">)))</span>
+
+    <span class="k">return</span> <span class="n">dot</span>
+
+</code></pre></div></div>
+
+<p>Let’s run that on our main function. For some reason (well, to be fully general, probably) the PyTorch converter will convert <code class="highlighter-rouge">Linear</code> layers to <code class="highlighter-rouge">batch_matmul</code> rather than just <code class="highlighter-rouge">dense</code>. We’ll get back to this in a bit. As TVM’s <code class="highlighter-rouge">batch_matmul</code> has the contraction axis last on both operands (unlike PyTorch), there are quite a few transpose o [...]
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">visualize</span><span class="p">(</span><span class="n">mod</span><span class="p">[</span><span class="s">'main'</span><span class="p">])</span>
+</code></pre></div></div>
+
+<p><img src="/images/bert-pytorch/bert-tvm_49_0.svg" alt="svg" /></p>
+
+<p>In addition to our named inputs, we see a number of unnamed (numbered) variables. These are the neural network parameters.</p>
+
+<p>Let us compile our model.</p>
+
+<p>Just like the full model, we can run and time our submodule after checking that it computes the same quantities.</p>
+
+<p>100 runs take 20.2ms. The back of the envelope calculation here is that with <code class="highlighter-rouge">BertLayer</code> in PyTorch we are spending about 0.2ms in this layer, so about 2.4ms on 12 layers - a not the majority but a sizeable part of the 6-7ms overall runtime. Let’s compare to TVM. (A good rule is to never optimize without measuring.)</p>
+
+<p>Similarly, TVM clocks in at 18.2ms for 100 runs. So here we are again roughly on par with PyTorch.</p>
+
+<p>One thing we see from the picture is that the input is reshaped three times. There is a TVM optimization pass called Common Subexpression Elimination (CSE) that combines the three reshapes.
+(A while ago, this did not succeed because it had distinct shape arguments, but this was since solved by the TVM developers in the dynamic to static conversion pass.)
+Also, the model parameters are reshaped and transposed. Can we get rid of that, too? 
+Yes. And for that we would first <em>bind</em> the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes. 
+With the <code class="highlighter-rouge">FoldConstant</code> pass, we can propagate the constants through the <code class="highlighter-rouge">transpose</code>s and <code class="highlighter-rouge">reshape</code>s to move them closer to the matmuls.</p>
+
+<p>After these three (which TVM will do when we compile a relay model), our model looks like this:</p>
+
+<p><img src="/images/bert-pytorch/bert-tvm_72_0.svg" alt="svg" /></p>
+
+<p>And now comes an interesting trick. It is more efficient to merge the three batch matmuls with the same input into a single <code class="highlighter-rouge">batch_matmul</code>. We implemented a pass doing this in <a href="https://github.com/apache/incubator-tvm/pull/5791">TVM PR 5791</a>. So let’s call it and also have another constant-folding pass.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">new_mod</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">transform</span><span class="o">.</span><span class="n">CombineParallelBatchMatmul</span><span class="p">()(</span><span class="n">new_mod</span><span class="p">)</span>
+<span class="n">new_mod</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">transform</span><span class="o">.</span><span class="n">FoldConstant</span><span class="p">()(</span><span class="n">new_mod</span><span class="p">)</span>
+<span class="n">visualize</span><span class="p">(</span><span class="n">new_mod</span><span class="p">[</span><span class="s">"main"</span><span class="p">])</span>
+</code></pre></div></div>
+
+<p><img src="/images/bert-pytorch/bert-tvm_74_0.svg" alt="svg" /></p>
+
+<p>Awesome. After checking that we still get the same result,
+we can time again: 25.2 ms for 100 runs. It’s a bit slow again because we need to tune for the new shapes.
+After tuning, we are at 12.6ms for 100 runs, so we went from about 0.2ms to about 0.13-0.15ms, a nice speedup.
+By our handwavy calculation, this should cut 0.6-0.8ms from the total runtime, or somewhere between 5%-10%. Let’s check.</p>
+
+<h2 id="results-on-the-overall-bert-model-after-optimization">Results on the overall BERT model after optimization</h2>
+
+<p>Let’s define a function combining the optimization passes from above and run it on the entire BERT model.
+We go through the same exercise as above.</p>
+
+<p>We get to 624ms for 100 runs. So yay, we went from 6.5-7ms in PyTorch to ~6.2ms in TVM. This is a 5%-10% speedup. Note that we have only taken a particular, not very large shape. A more serious analysis would consider more problem shapes.</p>
+
+<p>We could probably take it a bit further yet - e.g. fusing the additions after the batch matmul by handling the reshape, but we’ll leave it at this for now. Also we will benefit from further improvements to TVM, so it will be interesting to see how the benchmark improves over time. In particular, the upcoming Ansor tuning mechanism seems promising.</p>
+
+<h2 id="a-peek-under-the-hood">A peek under the hood</h2>
+
+<h3 id="comparing-implementation-of-models">Comparing implementation of models</h3>
+
+<p>As you can see, I have always compared PyTorch with TVM outputs to see if they’re good.
+Also, when I investigated some inner layer, I grabbed the inputs to that to convert and feed into the TVM model. I do believe that this is a very effective technique.</p>
+
+<p>Sometimes, however, it is difficult to assess whether a deviation between the results is from numerical accuracy or from an error somewhere.
+When I initially converted the model, the <code class="highlighter-rouge">SelfAttention</code> submodule output was replicated by the TVM model to about 1e-6.
+However, the BertLayer conversion had something like 1e-3. It was not entirely clear to me whether that might be due to accumulated numerical errors or some material deviation somewhere.
+(This turned out to be the GELU activation, which was converted to FastGELU.)</p>
+
+<p>One of the things I like to do in this case is jump to double precision and check there. Numerical errors should get much smaller, while other deviations would remain of the same order.
+With the PyTorch frontend, you can trace the model converted to float64 on the PyTorch side if you pass <code class="highlighter-rouge">default_dtype="float64"</code> to the conversion function.</p>
+
+<p>Running the module and comparing to PyTorch should now have 1e-14 or so deviation.</p>
+
+<h3 id="improvements-in-tvm-to-facilitate-this-usecase">Improvements in TVM to facilitate this usecase</h3>
+
+<p>Before this worked as shown here, we had to close some gaps (but a recent git checkout will include all of them):</p>
+<ul>
+  <li>The TVM PyTorch converter did not support inputs other than fp32. We <a href="https://github.com/t-vi/tvm/tree/pytorch_frontend_type_fix">implemented improved conversion</a>, now also included in TVM upstream.</li>
+  <li>The TVM schedule, i.e. the organization of the computation, of the workhorse operation, batch_matmul, was fixed (i.e. not tunable) and it was very slow (similar to running without a tuned schedule now). So we <a href="https://github.com/apache/incubator-tvm/pull/5752">implemented a tuneable schedule</a>.</li>
+  <li>The PyTorch converter produces batch matmul operations (it could probably also be changed to produce dense layers instead). But as we saw, one of the larger speed advantages is to combine Query Key and Value linear layers, so we implemented <a href="https://github.com/apache/incubator-tvm/pull/5791">fusing batch matmul operations</a>.</li>
+  <li>When comparing the computation results, we noticed that the <a href="https://pytorch.org/docs/master/generated/torch.nn.GELU.html">GELU</a> function was converted to its FastGELU variant. We fixed that. (There is a <em>fast math</em> optimization pass in TVM that does some replacement of the error function, though we didn’t check if it yields FastGELU for the GELU expressed with the error function.)</li>
+  <li>TVM was initially (and still is to some extent) focussed on static shapes. Recently it experiments with dynamic operations. The dynamic reshape - taking an argument for the target shape - is an early example of these experiments, but as seen above, it prevented the fusion of batch matmuls because the common subexpression elimination pass didn’t detect that it could merge the identical input reshaping. This has improved recently.</li>
+</ul>
+
+<h1 id="training-pytorch-models-with-tvm-computation">Training Pytorch models with TVM computation</h1>
+
+<p>In this second part we want to see if we could use TVM while training BERT in PyTorch.
+Of course, this opens an entire new can of worms as we need to deal with autodifferentiation.
+While we stay with the theme from above and take <code class="highlighter-rouge">BertLayer</code> as the example, our methodology is representative of non-trivial modules in general.
+We will want to divert the computation during training to TVM.</p>
+
+<p>So the user can take a (traceable) module and do</p>
+<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>add_tvm_dispatch(module, sample_input)
+</code></pre></div></div>
+<p>and then if she calls module with inputs of the same shape as the sample_input, she’ll get the outputs computed by TVM (as PyTorch tensors, of course) and if not, it’ll just use the regular forward.</p>
+
+<p>But we already hinted at the bad news: In this part we will see how to do these things. We will not yet achieve a great speedup.</p>
+
+<p>But enough talk, let us dive right in!
+Again, we get our relay model with running a traced <code class="highlighter-rouge">BertLayer</code> from the transformer <code class="highlighter-rouge">Bert</code> model through <code class="highlighter-rouge">tvm.relay.frontend.from_pytorch</code>.</p>
+
+<p>One thing we’ll do in between is to move from a modular interface in PyTorch - with named parameters - to a functional
+interface (which is what TVM can do for us). The first thing we want to do for that is arrange for the function arguments to be in an order that we can work with - i.e. first the direct inputs to the module and then the parameters in the same order that PyTorch uses them. After this operation, our <code class="highlighter-rouge">BertLayer </code> in TVM looks like this:</p>
+
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_20_0.svg" alt="svg" /></p>
+
+<p>As in the BERT inference, we want to run some optimization passes.</p>
+
+<p>But we also have a few new transformations:</p>
+
+<ul>
+  <li>One particularity of the Autodifferentiation is that it’ll use a lot of <code class="highlighter-rouge">..._like</code> operations to broadcast or “unbroadcast” (summation is the dual of broadcasting w.r.t. autodifferentiation) things. But this means that you now have two tensor arguments, even if the latter doesn’t really need a gradient. <code class="highlighter-rouge">ZappLike</code> replaces those operations with the corresponding functions taking a shape parameter instead.</li>
+  <li>Another thing is the “rooting” of derivatives. TVM generates a tensors with all ones of the same shape as the return values of our function as the starting point for the chain rule. These are then multiplied to the derivatives of our operations. But multiplication with ones is not doing much, so we strike that. Similarly, TVM initializes the gradient of a variable (an input) to zeros of the same shape. If it isn’t used, the gradient will be zero, but if it is, the “real gradient” w [...]
+  <li>TVM doesn’t have a training variant for the <code class="highlighter-rouge">LayerNorm</code> (or <code class="highlighter-rouge">BatchNorm</code> or others). So we implement a pass to spell out the computation.</li>
+  <li>TVM also doesn’t have training dropout. Here the problem is somewhat harder to fix, as TVM doesn’t have random currently. We instead replace the dropout by a construct taking a random bernoulli draw (of 0/1 values) and mimicking dropout with that. The idea is that we’ll use PyTorch to generate this mask for us. This has the added benefit that (if we generate dropout masks in the same order as PyTorch) we’ll get the exact same result.</li>
+</ul>
+
+<p>As hinted at above, TVM’s gradient taking assumes that it is the last element in the computation (the ones-Tensors discussed above). This isn’t a good fit with PyTorch’s modular view which expects a <code class="highlighter-rouge">grad_out</code> for each output to be given. Happily, this is computationally equivalent to multiplying by grad out and summation, so we amend our function with that. We wish to be flexible, so we allow both functions returning a single tensor and those retu [...]
+
+<p>With these modifications applied, our model looks like this:</p>
+
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_25_0.svg" alt="svg" /></p>
+
+<p>Finally we can take the grad. As we get a lot of <code class="highlighter-rouge">let</code> nodes, we bring it to normal form using the <code class="highlighter-rouge">ToGraphNormalForm</code> pass.
+TVM’s gradient-taking returns a function that has the same parameters as the original function (in our case amended with the <code class="highlighter-rouge">grad_out</code> and dropout) and then returns a tuple of the original return and a tuple containing gradients for all inputs.
+The first thing we do is to drop all the gradients for <code class="highlighter-rouge">grad_out</code> and <code class="highlighter-rouge">dropout</code> which we don’t need.
+Then we run our simplification passes.</p>
+
+<p>So this is the graph we have now for forward and backward:</p>
+
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_31_0.svg" alt="svg" /></p>
+
+<p>But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and 
+split our graph. One of the difficult problems is what to do with things computed for both forward and backward. It is a hard problem, related to the MinCut problem.</p>
+
+<p>Our extremal options could be:</p>
+<ul>
+  <li>One could only keep the inputs and recompute everything as needed.</li>
+  <li>If we had a scalar output, we could compute the gradient and multiply with the derivative of the later layers on backward. (Loss functions might do that.) This does not, however, work for non-scalar tensor outputs.</li>
+</ul>
+
+<p>We’ll do the following: We compute the forward normally, but we keep all things that will be used in the backward. This is too much, unfortunately, and it is very likely the reason we don’t see an end to end speedup. We’ll discuss some potential heuristics below.</p>
+
+<p>We use a coloring here. First we color all nodes of the forward computation in red. Then we traverse the gradient calculation and then color the nodes it needs from the backward blue. This gives us a chance to show off the attribute support in our visualization.</p>
+
+<p>A bit of (PyTorch) terminology: When we have a function <em>Layer : x ↦ y</em> followed by some <em>Loss: y ↦ l ∈ ℝ</em>, the backward is <em>BackwardOfLayer : grad<code class="highlighter-rouge">_</code>out ↦ grad<code class="highlighter-rouge">_</code>in</em> with <em>grad<code class="highlighter-rouge">_</code>out = dl/dy</em> and *grad<code class="highlighter-rouge">_</code>in = dl/dx`.</p>
+
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_34_0.svg" alt="svg" /></p>
+
+<p>In order to split the function as described above, we collect the blue nodes as the ones to capture - but constants will
+just be duplicated and inputs (<code class="highlighter-rouge">Var</code> nodes) need to be treated separately.
+Now we can split out the backward, replacing all the blue nodes with variables.</p>
+
+<p>Next we take the forward and amend it to also return the required intermediates. The forward then looks like this:</p>
+
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_40_0.svg" alt="svg" /></p>
+
+<p>TVM cannot return nested tuples, so we flatten the output in the function. Again we differentiate between tensor-valued functions and tuple valued ones (i.e. those returning potentially multiple tensors).</p>
+
+<p>And at last, we can let TVM do its magic and compile our functions, say to <code class="highlighter-rouge">gr_only_compiled_module</code>
+and <code class="highlighter-rouge">fw_and_cap_compiled_module</code>.
+Time to give it a spin. We define convenience functions to move tensors between PyTorch and TVM and get the model parameters as a TVM dictionary.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">def</span> <span class="nf">tensor_to_tvm</span><span class="p">(</span><span class="n">t</span><span class="p">):</span>
+    <span class="k">return</span> <span class="n">tvm</span><span class="o">.</span><span class="n">nd</span><span class="o">.</span><span class="n">from_dlpack</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">dlpack</span><span class="o">.</span><span class="n">to_dlpack</span><span class="p">(</span><span class="n">t</span><span class="p">))</span>
+<span class="k">def</span> <span class="nf">tensor_from_tvm</span><span class="p">(</span><span class="n">a</span><span class="p">):</span>
+    <span class="k">return</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">dlpack</span><span class="o">.</span><span class="n">from_dlpack</span><span class="p">(</span><span class="n">a</span><span class="o">.</span><span class="n">to_dlpack</span><span class="p">()))</span>
+
+<span class="n">model_params_tvm</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">tensor_to_tvm</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">pytorch_model</span><span class="o">.</span><span class="n">state_dict</span><span class="p">()</span [...]
+</code></pre></div></div>
+
+<p>Similarly, we get the inputs on the GPU in PyTorch and TVM.</p>
+
+<p>We need to deal with the dropout. It will turn out that our record of the three dropout random draws happens in the same order as the dropout in the model. We did a depth-first search on the computational graph to find them and if the values of the dropout are connected in the graph rather than being on independent branches, this will be the order in which PyTorch draws the matrices, too. If not, good luck fiddling with the order.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">torch</span><span class="o">.</span><span class="n">manual_seed</span><span class="p">(</span><span class="mi">12345</span><span class="p">)</span>
+<span class="n">drop_c</span> <span class="o">=</span> <span class="p">{}</span>
+<span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">dropout_info</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span> <span class="c1"># we don't know the order
+</span>    <span class="n">p</span><span class="p">,</span> <span class="n">typ</span> <span class="o">=</span> <span class="n">dropout_info</span><span class="p">[</span><span class="n">k</span><span class="p">]</span>
+    <span class="n">drop_c</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">dropout</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">([</span><span class="nb">int</span><span class="p">(< [...]
+                                              <span class="n">dtype</span><span class="o">=</span><span class="nb">getattr</span><span class="p">(</span><span class="n">torch</span><span class="p">,</span> <span class="n">typ</span><span class="o">.</span><span class="n">dtype</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="s">"cuda"</span><span class="p">),</span> <span class="n">p</span><span class="o">=</span><span class="n">p</span><s [...]
+
+<span class="n">drop_tvm</span> <span class="o">=</span> <span class="p">{</span><span class="n">n</span><span class="p">:</span> <span class="n">tensor_to_tvm</span><span class="p">(</span><span class="n">t</span><span class="p">)</span> <span class="k">for</span> <span class="n">n</span><span class="p">,</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">drop_c</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
+</code></pre></div></div>
+
+<p>Now we can run the forward.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="s">'input'</span><span class="p">,</span> <span class="n">inp_tvm</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
+<span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="s">'attention_mask'</span><span class="p">,</span> <span class="n">inp_tvm</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
+<span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="o">**</span><span class="n">model_params_tvm</span><span class="p">)</span>
+<span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="o">**</span><span class="n">drop_tvm</span><span class="p">)</span>
+<span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">run</span><span class="p">()</span>
+</code></pre></div></div>
+
+<p>And we can compare the output to PyTorch’s:</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">torch</span><span class="o">.</span><span class="n">manual_seed</span><span class="p">(</span><span class="mi">12345</span><span class="p">)</span>
+<span class="n">pytorch_model</span><span class="o">.</span><span class="n">train</span><span class="p">()</span>
+<span class="n">res</span> <span class="o">=</span> <span class="n">pytorch_model</span><span class="p">(</span><span class="o">*</span><span class="n">inp_c</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+<span class="n">numpy</span><span class="o">.</span><span class="nb">abs</span><span class="p">(</span><span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">get_output</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">()</span><span class="o">-</span><span class="n">res</span><span class="o">.</span><span class="n">detach</span><span class="p">()</span><sp [...]
+</code></pre></div></div>
+
+<p>This gives <code class="highlighter-rouge">2.1457672e-06</code>.</p>
+
+<p>Supergood. Let’s also try the backward. We generate a <code class="highlighter-rouge">grad_out</code>, set all the variables, and run the backward model.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">gr_out_c</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">res</span><span class="o">.</span><span class="n">shape</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s">"cuda"</span><span class="p">,</span> <span class="n">dtype< [...]
+</code></pre></div></div>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">num_captures</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">capture_vars</span><span class="p">)</span>
+<span class="n">num_regular_outputs</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">fw_and_cap_fn_flattened</span><span class="o">.</span><span class="n">body</span><span class="o">.</span><span class="n">fields</span><span class="p">)</span> <span class="o">-</span> <span class="n">num_captures</span>
+<span class="n">captured_values</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="o">.</span><span class="n">name_hint</span><span class="p">:</span> <span class="n">fw_and_cap_compiled_module</span><span class="o">.</span><span class="n">get_output</span><span class="p">(</span><span class="n">num_regular_outputs</span> <span class="o">+</span> <span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span>< [...]
+
+<span class="n">gr_only_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="o">**</span><span class="n">drop_tvm</span><span class="p">)</span>
+<span class="n">gr_only_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="o">**</span><span class="n">model_params_tvm</span><span class="p">)</span>
+<span class="n">gr_only_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="o">**</span><span class="n">captured_values</span><span class="p">)</span>
+<span class="n">gr_only_compiled_module</span><span class="o">.</span><span class="n">set_input</span><span class="p">(</span><span class="s">'gr:out:0'</span><span class="p">,</span> <span class="n">tensor_to_tvm</span><span class="p">(</span><span class="n">gr_out_c</span><span class="p">))</span>
+<span class="n">gr_only_compiled_module</span><span class="o">.</span><span class="n">run</span><span class="p">()</span>
+</code></pre></div></div>
+
+<p>On the PyTorch side, it is easiest to re-run the forward (remembering to reset the random seed) and get the grads.</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">torch</span><span class="o">.</span><span class="n">manual_seed</span><span class="p">(</span><span class="mi">12345</span><span class="p">)</span>
+<span class="n">pytorch_model</span><span class="o">.</span><span class="n">train</span><span class="p">()</span>
+<span class="n">inp_c_rq</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span><span class="o">.</span><span class="n">requires_grad_</span><span class="p">()</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">inp_c</span><span class="p">]</span>
+<span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">pytorch_model</span><span class="o">.</span><span class="n">parameters</span><span class="p">():</span>
+    <span class="n">p</span><span class="o">.</span><span class="n">requires_grad_</span><span class="p">()</span>
+<span class="n">res</span> <span class="o">=</span> <span class="n">pytorch_model</span><span class="p">(</span><span class="o">*</span><span class="n">inp_c_rq</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
+<span class="n">grads_pt</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">autograd</span><span class="o">.</span><span class="n">grad</span><span class="p">(</span><span class="n">res</span><span class="p">,</span> <span class="n">inp_c_rq</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">pytorch_model</span><span class="o">.</span><span class="n">parameters</span><span class="p">()),</sp [...]
+
+</code></pre></div></div>
+
+<p>Did it work? It seems so:</p>
+
+<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">g_pt</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">grads_pt</span><span class="p">):</span>
+    <span class="k">print</span><span class="p">(</span><span class="n">numpy</span><span class="o">.</span><span class="nb">abs</span><span class="p">(</span><span class="n">gr_only_compiled_module</span><span class="o">.</span><span class="n">get_output</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">()</span> <span class="o">-</span> <span class="n">g_pt</span><span class="o">.</span><s [...]
+</code></pre></div></div>
+
+<p>gives us a list of numbers in the 1e-5ish range.</p>
+
+<p>But we wanted to get something running in PyTorch, right?</p>
+
+<p>Keeping with how PyTorch works, we first define an <code class="highlighter-rouge">autograd.Function</code> that does the things we just did manually:</p>
+
+<p>In the <code class="highlighter-rouge">forward</code>:</p>
+
+<ul>
+  <li>Generate the dropout random values,</li>
+  <li>Run the forward,</li>
+  <li>Record the captures, inputs, and dropout values needed for backward.</li>
+</ul>
+
+<p>In the <code class="highlighter-rouge">backward</code>, run the backward and return the result (as PyTorch tensors).</p>
+
+<p>With that, we get a PyTorch autograd.Function calling into TVM (we would want a small wrapper for that).</p>
+
+<p>Now all we need to do to achieve our goal of getting a method <code class="highlighter-rouge">add_tvm_dispatch(module, sample_inputs)</code> is
+to trace the module, create the TVM-based autograd function from it and then replace the forward that calls
+that (with the parameters) if applicable or falls back to the usual forward.
+Python’s unlimited dynamism makes that kind of hackery relatively easy.
+As all this is not really TVM-related, we are sparing ourselves that here (but you could check the
+<a href="https://lernapparat.de/transformers-pytorch-tvm/">companion post</a>).</p>
+
+<h2 id="performance">Performance</h2>
+
+<p>As I said in the beginning, we aren’t quite where we want to eventually be in terms of performance.
+After tuning the tasks (and on the not very realistic inference example from the HuggingFace BERT + PyTorch JIT tutorial)
+we run 100 iterations of the TVM-enabled BertLayer forward and backward similar to how we did it for the inference.
+One iteration takes 6.2ms going through TVM versus 1.3ms on PyTorch.</p>
+
+<p>So we ran our model through TVM all right. But it’s not as fast as the usual method yet. Here’s to opportunity!</p>
+
+<p>More seriously, we have two immediate paths to improve performance:</p>
+
+<ul>
+  <li>Find a better set of captured nodes.</li>
+  <li>Find optimizations on the TVM graph.</li>
+</ul>
+
+<p>In terms of heuristics for the former (remember that it is quite likely NP-hard, i.e. I believe it is, but I didn’t work out a formal proof),
+one would want to re-do cheap computation, most prominently point-wise computation (or maybe anything but matmul?). But that is for another day.</p>
+
+<p>I hope you enjoyed the tutorial, I look forward to your comments at <a href="mailto:tv@lernapparat.de">tv@lernapparat.de</a>.</p>
+
+<h1 id="acknowledgements">Acknowledgements</h1>
+
+<p>I had many interesting discussions with HuggingFace people and Morgan Funtowicz in particular. Also the TVM contributors had many good comments during the review of the patches to TVM and on the forums. The creation of this tutorial was sponsored by AMD.</p>
+
+<h1 id="author">Author</h1>
+
+<p><a href="https://lernapparat.de/">Thomas Viehmann</a> is the founder of <a href="https://mathinf.eu/">MathInf GmbH</a>, Munich, Germany, a boutique training and consultancy firm focusing on Machine Learning and PyTorch.
+He is a PyTorch core developer and co-authored <a href="https://www.manning.com/books/deep-learning-with-pytorch">Deep Learning with PyTorch</a>, which is currently available as a <a href="https://pytorch.org/deep-learning-with-pytorch">free download from the PyTorch website</a>.</p>
+
+    </div>
+  </div>
+</div>
+</div>
+
+
+    
+
+
+
+
+
+    <div class="container">
+
+      <footer class="small">
+        Apache TVM is an effort undergoing incubation at The Apache Software Foundation (ASF),
+        sponsored by the <i>Apache Incubator</i>. Incubation is required
+        of all newly accepted projects until a further review indicates that the infrastructure,
+        communications, and decision making process have stabilized in a manner consistent with other
+        successful ASF projects. While incubation status is not necessarily a reflection of the completeness
+        or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
+
+        Copyright © 2020 The Apache Software Foundation. Apache TVM, Apache,
+        the Apache feather, and the Apache TVM project logo are either trademarks or registered trademarks of the Apache Software Foundation.
+
+        See also other useful <a href="/asf" class="footer-link">ASF links</a>:
+        <a href="https://www.apache.org/" class="footer-link">Apache Homepage</a>,
+        <a href="https://www.apache.org/licenses/" class="footer-link">License</a>
+        <a href="https://www.apache.org/foundation/sponsorship.html" class="footer-link">Sponsorship</a>,
+        <a href="https://www.apache.org/security/" class="footer-link">Security</a>
+        <a href="https://www.apache.org/foundation/thanks.html" class="footer-link">Thanks</a>,
+        <a href="https://www.apache.org/events/current-event.html" class="footer-link">Current Event</a>
+
+      </footer>
+    </div>
+  </body>
+</html>
+
+
diff --git a/atom.xml b/atom.xml
index 0e92834..b91c889 100644
--- a/atom.xml
+++ b/atom.xml
@@ -4,7 +4,7 @@
  <title>TVM</title>
  <link href="https://tvm.apache.org" rel="self"/>
  <link href="https://tvm.apache.org"/>
- <updated>2020-07-10T19:53:16-07:00</updated>
+ <updated>2020-07-14T09:04:35-07:00</updated>
  <id>https://tvm.apache.org</id>
  <author>
    <name></name>
@@ -13,6 +13,529 @@
 
  
  <entry>
+   <title>Bridging PyTorch and TVM</title>
+   <link href="https://tvm.apache.org/2020/07/14/bert-pytorch-tvm"/>
+   <updated>2020-07-14T00:00:00-07:00</updated>
+   <id>https://tvm.apache.org/2020/07/14/bert-pytorch-tvm</id>
+   <content type="html">
+&lt;p&gt;(A more code-heavy variant is crossposted on the more PyTorch affine &lt;a href=&quot;https://lernapparat.de/transformers-pytorch-tvm/&quot;&gt;Lernapparat&lt;/a&gt;,
+ the Jupyter Notebook to follow along is on &lt;a href=&quot;https://github.com/t-vi/pytorch-tvmisc/tree/master/transformers-pytorch-tvm/&quot;&gt;github&lt;/a&gt;.)&lt;/p&gt;
+
+&lt;p&gt;Some of the most intriguing applications of Artificial Intelligence have been in Natural Language Processing.
+Models like BERT or GPT-2 and their variants can seemingly grasp enough of a text to continue it in a way that needs a second look to recognize as gibberish.&lt;/p&gt;
+
+&lt;p&gt;These models belong to a class of neural network architectures called &lt;em&gt;Transformers&lt;/em&gt;. One of the favourite libraries
+implementing them is the &lt;a href=&quot;https://github.com/huggingface/transformers/&quot;&gt;HuggingFace transformers library&lt;/a&gt;.&lt;/p&gt;
+
+&lt;p&gt;But, in contrast to convolutional models or LSTMs where we have heavily optimized implementations, this is not as much the case for transformers.
+So here we explore how TVM can fill the gap. We will do so in two steps:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;First we look at BERT inference and tuning that on TVM.&lt;/li&gt;
+  &lt;li&gt;Secondly, we start some more fundamental exploration of how one could use TVM for training in PyTorch.
+Given the experimental nature, we focus on feasibility more than on the performance in this part.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;optimizing-bert-inference-with-tvm&quot;&gt;Optimizing BERT Inference with TVM&lt;/h1&gt;
+
+&lt;p&gt;So how do we get BERT from the transformer library to TVM?&lt;/p&gt;
+
+&lt;p&gt;Helpfully, transformers supports tracing their model with the PyTorch JIT. We use their &lt;a href=&quot;https://huggingface.co/transformers/torchscript.html&quot;&gt;tutorial on it&lt;/a&gt;,
+specifically the part until we have a traced model.&lt;/p&gt;
+
+&lt;p&gt;The PyTorch traced model takes around 0.65-0.7 seconds for 100 runs on my AMD Radeon VII with the example inputs, which means 6.5-7ms per run.
+We can try to see if we can use TVM to get faster. Converting our model to TVM is a breeze:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;shape_list&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;debugName&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span [...]
+
+&lt;span class=&quot;n&quot;&gt;mod_bert&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params_bert&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;frontend&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt [...]
+                        &lt;span class=&quot;n&quot;&gt;shape_list&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;default_dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;float32&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;There will be a few warnings about not finding dtype information, but it goes well!
+We can now build and run it. Building follows the standard TVM recipe. We also convert the PyTorch (cpu) tensors to TVM arrays.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'rocm -model=gfx906'&lt;/span&gt;  &lt;span class=&quot;c1&quot;&gt;# use what matches your GPU
+&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;target_host&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'llvm'&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;context&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+&lt;span class=&quot;n&quot;&gt;tt_a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tokens_tensor&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span [...]
+&lt;span class=&quot;n&quot;&gt;st_a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;segments_tensors&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;s [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;backend&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compile_engine&lt;/ [...]
+&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;transform&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PassContext&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;opt_level&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt [...]
+        &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt [...]
+                                     &lt;span class=&quot;n&quot;&gt;target&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
+                                     &lt;span class=&quot;n&quot;&gt;target_host&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;target_host&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
+                                     &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;params_bert&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;contrib&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;graph_runtime&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;This will warn us a few times:&lt;/p&gt;
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    WARNING:autotvm:Cannot find config for ... batch_matmul.cuda .... A fallback configuration is used, which may bring great performance regression.
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Uh oh, &lt;em&gt;may bring great performance regression&lt;/em&gt;. Let us see.&lt;/p&gt;
+
+&lt;p&gt;But first we run the model and see if the outputs match:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mf&quot;&gt;8.583069e-06&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mf&quot;&gt;8.493662e-07&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Looks good. Remember that we’re computing in float32, so $10^{-6}$ish is a good result.&lt;/p&gt;
+
+&lt;p&gt;After building our model and setting the parameters, we time our model like this:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;():&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;range&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;100&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;run&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;sync&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;o&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;timeit&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Ouch, it takes 6.65s per 100 runs, or 67ms per run of the model. That’s slow indeed. But the warning said that it was because it could not find (tuned) configurations. Let us then tune the tasks.&lt;/p&gt;
+
+&lt;p&gt;Tuning does take half a day or so (I’m basically following the TVM tuning tutorial for ResNet tuning with autotvm.)&lt;/p&gt;
+
+&lt;p&gt;After this, we can again build the model, this time with the new configuration. This time we should see no comments about missing configurations.
+Now it’s in the region of 6.5-7ms per run, similar to PyTorch. This is what we get from this very elementary optimization of our operators. We can push it a little further, though.&lt;/p&gt;
+
+&lt;p&gt;To see how, let us dive deep into BERT modeling and TVM.&lt;/p&gt;
+
+&lt;p&gt;If you don’t want to get the full details, do skip the next section and scroll down to &lt;em&gt;Results&lt;/em&gt;. I should add that I would hope that this tuning part of the tutorial will obsolete itself in the sense that in some near future, you will get much better speed right out of the box or at least after some initial tuning. So if you don’t see a speedup between here and &lt;em&gt;Results&lt;/em&gt;, that’s because I did my homework in submitting patches.&lt;/p&gt;
+
+&lt;h2 id=&quot;the-bert-model&quot;&gt;The BERT model&lt;/h2&gt;
+
+&lt;p&gt;Let us take a closer look at what’s going on in BERT.&lt;/p&gt;
+
+&lt;p&gt;Like many deep learning models, BERT comes with a bit of prologue (vocabulary embeddings) and epilogue (pooling) and the bulk is organized into similar-looking blocks, here we have 12 &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; modules.
+The &lt;code class=&quot;highlighter-rouge&quot;&gt;attention_mask&lt;/code&gt; is just to prevent BERT from looking at the answer when dealing with the question.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_model.svg&quot; alt=&quot;Bert Model&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;So let us zoom in and look at a BertLayer in detail, since that ultimately is what we need to make fast.
+As we see in the net diagram, the main part of the &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; module is a submodule &lt;code class=&quot;highlighter-rouge&quot;&gt;BertSelfAttention&lt;/code&gt;.&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_layer.svg&quot; alt=&quot;BertLayer&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Now the &lt;code class=&quot;highlighter-rouge&quot;&gt;BertSelfAttention&lt;/code&gt; captures the famed self-attention mechanism that is the hallmark of transformer models. (I cannot recommend Sascha Rush’s &lt;a href=&quot;http://nlp.seas.harvard.edu/2018/04/03/attention.html&quot;&gt;Annotated Transformer&lt;/a&gt; enough as a detailed walkthrough.)&lt;/p&gt;
+
+&lt;h2 id=&quot;putting-the-bertlayer-under-the-microscope&quot;&gt;Putting the BertLayer under the Microscope&lt;/h2&gt;
+
+&lt;p&gt;If we want to go into details, we should want to run a BertLayer individually.
+We grab the inputs of a BertLayer (see the Notebook for how) and convert a single &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; to TVM as we did for the entire model.&lt;/p&gt;
+
+&lt;p&gt;To look at the TVM module, we define a little visualization helper (loosely based on TVM &lt;a href=&quot;https://github.com/apache/incubator-tvm/pull/4370&quot;&gt;PR#4370&lt;/a&gt;).&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;graphviz&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;expr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;collapse_small&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node_attr_di [...]
+    &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;collect_ops&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;ops&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;set&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;visitor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ir&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Op&lt;/span&gt;&lt [...]
+                &lt;span class=&quot;n&quot;&gt;ops&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;analysis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;post_order_visit&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/s [...]
+        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ops&lt;/span&gt;
+
+    &lt;span class=&quot;c1&quot;&gt;# node_dict maps a Relay node to an index (node ID)
+&lt;/span&gt;    &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;_traverse_expr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt;
+        &lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;len&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;analysis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;post_order_visit&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;expr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span& [...]
+
+    &lt;span class=&quot;n&quot;&gt;relayviz_nodes&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;
+
+    &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graphviz&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Digraph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;format&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'svg'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;  [...]
+    &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;attr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'node'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'box'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+
+    &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;to_str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Constant&lt;/spa [...]
+            &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;repr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lstrip&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'Constant('&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)[:&lt [...]
+        &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;raise&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;NotImplementedError&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;to_str:&quot;&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;repr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
+
+    &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;is_small_const&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;not&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;collapse_small&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;and&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm [...]
+            &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;False&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;runtime&lt;/span&gt; [...]
+            &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;prod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; [...]
+        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;
+            
+    &lt;span class=&quot;c1&quot;&gt;# Sort by node ID
+&lt;/span&gt;    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;sorted&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&g [...]
+        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Function&lt;/spa [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Function'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/spa [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;& [...]
+        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Var&lt;/span&g [...]
+            &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;type_annotation&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;is&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;not&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;None&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;hasattr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;type_annotation&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'shape'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;int&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x&lt;/s [...]
+                    &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;type_annotation&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;typstr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Tensor[{}, {}]'&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;format&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot; [...]
+                &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;typstr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;type_annotation&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                &lt;span class=&quot;n&quot;&gt;typstr&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'?'&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;d&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'ellipse'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;d&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;update&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_attr_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span& [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+                     &lt;span class=&quot;s&quot;&gt;'{}: {}'&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;format&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+                         &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;name_hint&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typstr&lt;/span&gt;
+                     &lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tuple&lt;/span [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Tuple[...])'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/ [...]
+            &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;field&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fields&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;field&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/sp [...]
+        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Constant&lt;/s [...]
+            
+            &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;not&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;is_small_const&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# small consts are shown in ops
+&lt;/span&gt;                &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Constant({}, {})'&lt;/span&gt;&lt;span class=& [...]
+                        &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_attr_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}))&lt;/span&gt;
+        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Call&lt;/span& [...]
+            &lt;span class=&quot;n&quot;&gt;args_with_edge&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;arg_str_list&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[]&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;is_small_const&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;arg_str_list&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;append&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;to_str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
+                &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;arg_str_list&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;append&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'·'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;args_with_edge&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;append&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;arg_str&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;', '&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;join&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg_str_list&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ir&lt;/span&gt; [...]
+                &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;
+                &lt;span class=&quot;n&quot;&gt;attrs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;getattr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;attrs&lt;/span [...]
+                &lt;span class=&quot;c1&quot;&gt;#attrs = inspect.getmembers(node.attrs)
+&lt;/span&gt;                &lt;span class=&quot;n&quot;&gt;attr_str_list&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;+&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'='&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;+&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+                &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;attr_str_list&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;attr_str&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'| '&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;', '&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;join&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;attr_str_list&lt;/span&gt;&lt;span class=&quot;p&quot [...]
+                &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;attr_str&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;''&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                &lt;span class=&quot;n&quot;&gt;ops&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;collect_ops&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+                &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ops&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'_'&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;join&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ops&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+                &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                    &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'...'&lt;/span&gt;
+                &lt;span class=&quot;n&quot;&gt;attr_str&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;''&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'{name}({arg_str}{attr_str})'&lt;/span&gt;
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
+            &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;args_with_edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+                &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span [...]
+        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ir&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Op&lt;/span&gt;&l [...]
+            &lt;span class=&quot;c1&quot;&gt;# dot.node(str(node_id), 'Op {}'.format(node.name))
+&lt;/span&gt;            &lt;span class=&quot;k&quot;&gt;pass&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# covered in call
+&lt;/span&gt;        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;T [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'TupleGetItem(idx={})'&lt;/span&gt;&lt;span class=&quot;o&quot;& [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;& [...]
+        &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Let&lt;/span&g [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Let(XX)'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;& [...]
+            &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;& [...]
+        &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+            &lt;span class=&quot;k&quot;&gt;raise&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;RuntimeError&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+                &lt;span class=&quot;s&quot;&gt;'Unknown node type. node_id: {}, node: {}'&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;format&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;type&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;& [...]
+
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;
+
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Let’s run that on our main function. For some reason (well, to be fully general, probably) the PyTorch converter will convert &lt;code class=&quot;highlighter-rouge&quot;&gt;Linear&lt;/code&gt; layers to &lt;code class=&quot;highlighter-rouge&quot;&gt;batch_matmul&lt;/code&gt; rather than just &lt;code class=&quot;highlighter-rouge&quot;&gt;dense&lt;/code&gt;. We’ll get back to this in a bit. As TVM’s &lt;code class=&quot;highlighter-rouge&quot;&gt;batch_matmul&lt;/code&gt; has  [...]
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'main'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_49_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;In addition to our named inputs, we see a number of unnamed (numbered) variables. These are the neural network parameters.&lt;/p&gt;
+
+&lt;p&gt;Let us compile our model.&lt;/p&gt;
+
+&lt;p&gt;Just like the full model, we can run and time our submodule after checking that it computes the same quantities.&lt;/p&gt;
+
+&lt;p&gt;100 runs take 20.2ms. The back of the envelope calculation here is that with &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; in PyTorch we are spending about 0.2ms in this layer, so about 2.4ms on 12 layers - not the majority, but a sizeable part, of the 6-7ms overall runtime. Let’s compare to TVM. (A good rule is to never optimize without measuring.)&lt;/p&gt;
+
+&lt;p&gt;Similarly, TVM clocks in at 18.2ms for 100 runs. So here we are again roughly on par with PyTorch.&lt;/p&gt;
+
+&lt;p&gt;One thing we see from the picture is that the input is reshaped three times. There is a TVM optimization pass called Common Subexpression Elimination (CSE) that combines the three reshapes.
+(A while ago, this did not succeed because it had distinct shape arguments, but this was since solved by the TVM developers in the dynamic to static conversion pass.)
+Also, the model parameters are reshaped and transposed. Can we get rid of that, too?
+Yes. And for that we would first &lt;em&gt;bind&lt;/em&gt; the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes. 
+With the &lt;code class=&quot;highlighter-rouge&quot;&gt;Foldconstant&lt;/code&gt; pass, we can propagate the constants through the &lt;code class=&quot;highlighter-rouge&quot;&gt;transpose&lt;/code&gt;s and &lt;code class=&quot;highlighter-rouge&quot;&gt;reshape&lt;/code&gt;s to move them closer to the matmuls.&lt;/p&gt;
+
+&lt;p&gt;After these three (which TVM will do when we compile a relay model), our model looks like this:&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_72_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;And now comes an interesting trick. It is more efficient to merge the three batch matmuls with the same input into a single &lt;code class=&quot;highlighter-rouge&quot;&gt;batch_matmul&lt;/code&gt;. We implemented a pass doing this in &lt;a href=&quot;https://github.com/apache/incubator-tvm/pull/5791&quot;&gt;TVM PR 5791&lt;/a&gt;. So let’s call it and also have another constant-folding pass.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;new_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;transform&lt;/spa [...]
+&lt;span class=&quot;n&quot;&gt;new_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;transform&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;FoldConstant&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()(&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;new_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;main&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_74_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Awesome. After checking that we still get the same result,
+we can time again: 25.2 ms for 100 runs. It’s a bit slow again because we need to tune for the new shapes.
+After tuning, we are at 12.6ms for 100 runs, so we went from about 0.2ms to about 0.13-0.15ms, a nice speedup.
+By our handwavy calculation, this should cut 0.6-0.8ms from the total runtime, or somewhere between 5%-10%. Let’s check.&lt;/p&gt;
+
+&lt;h2 id=&quot;results-on-the-overall-bert-model-after-optimization&quot;&gt;Results on the overall BERT model after optimization&lt;/h2&gt;
+
+&lt;p&gt;Let’s define a function combining the optimization passes from above and run it on the entire BERT model.
+We go through the same exercise as above.&lt;/p&gt;
+
+&lt;p&gt;We get to 624ms for 100 runs. So yay, we went from 6.5-7ms in PyTorch to ~6.2ms in TVM. This is a 5%-10% speedup. Note that we have only taken a particular, not very large shape. A more serious analysis would consider more problem shapes.&lt;/p&gt;
+
+&lt;p&gt;We could probably take it a bit further yet - e.g. fusing the additions after the batch matmul by handling the reshape, but we’ll leave it at this for now. Also we will benefit from further improvements to TVM, so it will be interesting to see how the benchmark improves over time. In particular, the upcoming Ansor tuning mechanism seems promising.&lt;/p&gt;
+
+&lt;h2 id=&quot;a-peek-under-the-hood&quot;&gt;A peek under the hood&lt;/h2&gt;
+
+&lt;h3 id=&quot;comparing-implementation-of-models&quot;&gt;Comparing implementation of models&lt;/h3&gt;
+
+&lt;p&gt;As you can see, I have always compared PyTorch with TVM outputs to see if they’re good.
+Also, when I investigated some inner layer, I grabbed the inputs to that to convert and feed into the TVM model. I do believe that this is a very effective technique.&lt;/p&gt;
+
+&lt;p&gt;Sometimes, however, it is difficult to assess whether a deviation between the results is from numerical accuracy or from an error somewhere.
+When I initially converted the model, the &lt;code class=&quot;highlighter-rouge&quot;&gt;SelfAttention&lt;/code&gt; submodule output was replicated by the TVM model to about 1e-6.
+However, the BertLayer conversion had something like 1e-3. I was not entirely sure whether that might be due to accumulated numerical errors or some material deviation somewhere.
+(This turned out to be the GELU activation, which was converted to FastGELU.)&lt;/p&gt;
+
+&lt;p&gt;One of the things I like to do in this case is jump to double precision and check there. Numerical errors should get much smaller, while other deviations would remain of the same order.
+With the PyTorch frontend, you can trace the model converted to float64 on the PyTorch side if you pass &lt;code class=&quot;highlighter-rouge&quot;&gt;default_dtype=&quot;float64&quot;&lt;/code&gt; to the conversion function.&lt;/p&gt;
+
+&lt;p&gt;Running the module and comparing to PyTorch should now have 1e-14 or so deviation.&lt;/p&gt;
+
+&lt;h3 id=&quot;improvements-in-tvm-to-facilitate-this-usecase&quot;&gt;Improvements in TVM to facilitate this usecase&lt;/h3&gt;
+
+&lt;p&gt;Before this worked as shown here, we had to close some gaps (but a recent git checkout will include all of them):&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;The TVM PyTorch converter did not support inputs other than fp32. We &lt;a href=&quot;https://github.com/t-vi/tvm/tree/pytorch_frontend_type_fix&quot;&gt;implemented improved conversion&lt;/a&gt;, now also included in TVM upstream.&lt;/li&gt;
+  &lt;li&gt;The TVM schedule, i.e. the organization of the computation, of the workhorse operation, batch_matmul, was fixed and it was very slow (similar to running without a tuned schedule now). So we &lt;a href=&quot;https://github.com/apache/incubator-tvm/pull/5752&quot;&gt;implemented a tuneable schedule&lt;/a&gt;.&lt;/li&gt;
+  &lt;li&gt;The PyTorch converter produces batch matmul operations (it could probably also be changed to produce dense layers instead). But as we saw, one of the larger speed advantages is to combine Query Key and Value linear layers, so we implemented &lt;a href=&quot;https://github.com/apache/incubator-tvm/pull/5791&quot;&gt;fusing batch matmul operations&lt;/a&gt;.&lt;/li&gt;
+  &lt;li&gt;When comparing the computation results, we noticed that the &lt;a href=&quot;https://pytorch.org/docs/master/generated/torch.nn.GELU.html&quot;&gt;GELU&lt;/a&gt; function was converted to its FastGELU variant. We fixed that. (There is a &lt;em&gt;fast math&lt;/em&gt; optimization pass in TVM that does some replacement of the error function, though we didn’t check if it yields FastGELU for the GELU expressed with the error function.)&lt;/li&gt;
+  &lt;li&gt;TVM was initially (and still is to some extent) focussed on static shapes. Recently it experiments with dynamic operations. The dynamic reshape - taking an argument for the target shape - is an early example of these experiments, but as seen above, it prevented the fusion of batch matmuls because the common subexpression elimination pass didn’t detect that it could merge the identical input reshaping. This has improved recently.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;h1 id=&quot;training-pytorch-models-with-tvm-computation&quot;&gt;Training Pytorch models with TVM computation&lt;/h1&gt;
+
+&lt;p&gt;In this second part we want to see if we could use TVM while training BERT in PyTorch.
+Of course, this opens an entire new can of worms as we need to deal with autodifferentiation.
+While we stay with the theme from above and take &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; as the example, our methodology is representative of non-trivial modules in general.
+We will want to divert the computation during training to TVM.&lt;/p&gt;
+
+&lt;p&gt;So the user can take a (traceable) module and do&lt;/p&gt;
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;add_tvm_dispatch(module, sample_input)
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+&lt;p&gt;and then if she calls module with inputs of the same shape as the sample_input, she’ll get the outputs computed by TVM (as PyTorch tensors, of course) and if not, it’ll just use the regular forward.&lt;/p&gt;
+
+&lt;p&gt;But we already hinted at the bad news: In this part we will see how to do these things. We will not yet achieve a great speedup.&lt;/p&gt;
+
+&lt;p&gt;But enough talk, let us dive right in!
+Again, we get our relay model with running a traced &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; from the transformer &lt;code class=&quot;highlighter-rouge&quot;&gt;Bert&lt;/code&gt; model through &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.relay.frontend.from_pytorch&lt;/code&gt;.&lt;/p&gt;
+
+&lt;p&gt;One thing we’ll do in between is to move from a modular interface in PyTorch - with named parameters - to a functional
+interface (which is what TVM can do for us). The first thing we want to do for that is arrange for the function arguments to be in an order that we can work with - i.e. first the direct inputs to the module and then the parameters in the same order that PyTorch uses them. After this operation, our &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer &lt;/code&gt; in TVM looks like this:&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_20_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;As in the BERT inference, we want to run some optimization passes.&lt;/p&gt;
+
+&lt;p&gt;But we also have a few new transformations:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;One particularity of the Autodifferentiation is that it’ll use a lot of &lt;code class=&quot;highlighter-rouge&quot;&gt;..._like&lt;/code&gt; operations to broadcast or “unbroadcast” (summation is the dual of broadcasting w.r.t. autodifferentiation) things. But this means that you now have two tensor arguments, even if the latter doesn’t really need a gradient. &lt;code class=&quot;highlighter-rouge&quot;&gt;ZappLike&lt;/code&gt; replaces those operations with the correspondi [...]
+  &lt;li&gt;Another thing is the “rooting” of derivatives. TVM generates a tensors with all ones of the same shape as the return values of our function as the starting point for the chain rule. These are then multiplied to the derivatives of our operations. But multiplication with ones is not doing much, so we strike that. Similarly, TVM initializes the gradient of a variable (an input) to zeros of the same shape. If it isn’t used, the gradient will be zero, but if it is, the “real gradi [...]
+  &lt;li&gt;TVM doesn’t have a training variant for the &lt;code class=&quot;highlighter-rouge&quot;&gt;LayerNorm&lt;/code&gt; (or &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchNorm&lt;/code&gt; or others). So we implement a pass to spell out the computation.&lt;/li&gt;
+  &lt;li&gt;TVM also doesn’t have training dropout. Here the problem is somewhat harder to fix, as TVM doesn’t have random currently. We instead replace the dropout by a construct taking a random bernoulli draw (of 0/1 values) and mimicking dropout with that. The idea is that we’ll use PyTorch to generate this mask for us. This has the added benefit that (if we generate dropout masks in the same order as PyTorch) we’ll get the exact same result.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;As hinted at above, TVM’s gradient taking assumes that it is the last element in the computation (the ones-Tensors discussed above). This isn’t a good fit with PyTorch’s modular view which expects a &lt;code class=&quot;highlighter-rouge&quot;&gt;grad_out&lt;/code&gt; for each output to be given. Happily, this is computationally equivalent to multiplying by grad out and summation, so we amend our function with that. We wish to be flexible, so we allow both functions returning a  [...]
+
+&lt;p&gt;With these modifications applied, our model looks like this:&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_25_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;Finally we can take the grad. As we get a lot of &lt;code class=&quot;highlighter-rouge&quot;&gt;let&lt;/code&gt; nodes, we bring it to normal form using the &lt;code class=&quot;highlighter-rouge&quot;&gt;ToGraphNormalForm&lt;/code&gt; pass.
+TVM’s gradient-taking returns a function that has the same parameters as the original function (in our case amended with the &lt;code class=&quot;highlighter-rouge&quot;&gt;grad_out&lt;/code&gt; and dropout) and then returns a tuple of the original return and a tuple containing gradients for all inputs.
+The first thing we do is to drop all the gradients for &lt;code class=&quot;highlighter-rouge&quot;&gt;grad_out&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;dropout&lt;/code&gt; which we don’t need.
+Then we run our simplification passes.&lt;/p&gt;
+
+&lt;p&gt;So this is the graph we have now for forward and backward:&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_31_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and 
+split our graph. One of the difficult problems is what to do with things computed for both forward and backward. It is a hard problem, related to the MinCut problem.&lt;/p&gt;
+
+&lt;p&gt;Our extremal options could be:&lt;/p&gt;
+&lt;ul&gt;
+  &lt;li&gt;One could only keep the inputs and recompute everything as needed.&lt;/li&gt;
+  &lt;li&gt;If we had a scalar output, we could compute the gradient and multiply with the derivative of the later layers on backward. (Loss functions might do that.) This does not, however, work for non-scalar tensor outputs.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;We’ll do the following: We compute the forward normally, but we keep all things that will be used in the backward. This is too much, unfortunately, and it is very likely the reason we don’t see an end to end speedup. We’ll discuss some potential heuristics below.&lt;/p&gt;
+
+&lt;p&gt;We use a coloring here. First we color all nodes of the forward computation in red. Then we traverse the gradient calculation and then color the nodes it needs from the backward blue. This gives us a chance to show off the attribute support in our visualization.&lt;/p&gt;
+
+&lt;p&gt;A bit of (PyTorch) terminology: When we have a function &lt;em&gt;Layer : x ↦ y&lt;/em&gt; followed by some &lt;em&gt;Loss: y ↦ l ∈ ℝ&lt;/em&gt;, the backward is &lt;em&gt;BackwardOfLayer : grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;out ↦ grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;in&lt;/em&gt; with &lt;em&gt;grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;out = dl/dy&lt;/em&gt; and *grad&lt;code class=&quot;highlig [...]
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_34_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;In order to split the function as described above, we collect the blue nodes as to capture - but constants will
+just be duplicated and inputs (&lt;code class=&quot;highlighter-rouge&quot;&gt;Var&lt;/code&gt; nodes) need to be treated separately.
+Now we can split out the backward, replacing all the blue nodes with variables.&lt;/p&gt;
+
+&lt;p&gt;Next we take the forward and amend it to also return the required intermediates. The forward then looks like this:&lt;/p&gt;
+
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_40_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+
+&lt;p&gt;TVM cannot return nested tuples, so we flatten the output in the function. Again we differentiate between tensor-valued functions and tuple valued ones (i.e. those returning potentially multiple tensors).&lt;/p&gt;
+
+&lt;p&gt;And at last, we can let TVM do its magic and compile our functions, say to &lt;code class=&quot;highlighter-rouge&quot;&gt;gr_only_compiled_module&lt;/code&gt;
+and &lt;code class=&quot;highlighter-rouge&quot;&gt;fw_and_cap_compiled_module&lt;/code&gt;.
+Time to give it a spin. We define convenience functions to move tensors between PyTorch and TVM and get the model parameters as a TVM dictionary.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;tensor_to_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_dlpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;utils&lt;/span&gt;& [...]
+&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;tensor_from_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;utils&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dlpack&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_dlpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; [...]
+
+&lt;span class=&quot;n&quot;&gt;model_params_tvm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_to_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;v&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Similarly, we get the inputs on the GPU in PyTorch and TVM.&lt;/p&gt;
+
+&lt;p&gt;We need to deal with the dropout. It will turn out that our record of the three dropout random draws happens in the same order as the dropout in the model. We did a depth-first search on the computational graph to find them and if the values of the dropout are connected in the graph rather than being on independent branches, this will be the order in which PyTorch draws the matrices, too. If not, good luck fiddling with the order.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;manual_seed&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;12345&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dropout_info&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;keys&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;():&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# we don't know the order
+&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;p&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typ&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dropout_info&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;functional&lt;/span&gt;&lt;spa [...]
+                                              &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;getattr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typ&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span c [...]
+
+&lt;span class=&quot;n&quot;&gt;drop_tvm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_to_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;sp [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Now we can run the forward.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'input'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n& [...]
+&lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'attention_mask'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inp_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p [...]
+&lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;model_params_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;drop_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;run&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;And we can compare the output to PyTorch’s:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;manual_seed&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;12345&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;pytorch_model&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;train&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pytorch_model&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inp_c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;abs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;This gives &lt;code class=&quot;highlighter-rouge&quot;&gt;2.1457672e-06&lt;/code&gt;.&lt;/p&gt;
+
+&lt;p&gt;Supergood. Let’s also try the backward. We generate a &lt;code class=&quot;highlighter-rouge&quot;&gt;grad_out&lt;/code&gt;, set all the variables and run the backward model&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;gr_out_c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;randn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;res&lt;/span&g [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;num_captures&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;len&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;capture_vars&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;num_regular_outputs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;len&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fw_and_cap_fn_flattened&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;body&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fields&lt;/span&gt;&lt;span class=&quot;p&qu [...]
+&lt;span class=&quot;n&quot;&gt;captured_values&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;v&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;name_hint&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fw_and_cap_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot; [...]
+
+&lt;span class=&quot;n&quot;&gt;gr_only_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;drop_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;gr_only_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;model_params_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;gr_only_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;captured_values&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;gr_only_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'gr:out:0'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_to_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;gr_out_c&lt;/span&gt;&lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;gr_only_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;run&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;On the PyTorch side, it is easiest to re-run the forward (remembering to reset the random seed) and get the grads.&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;manual_seed&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;12345&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;pytorch_model&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;train&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;inp_c_rq&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;requires_grad_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; & [...]
+&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;p&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pytorch_model&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;parameters&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;():&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;p&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;requires_grad_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;res&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pytorch_model&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inp_c_rq&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;grads_pt&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;autograd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;res&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;Did it work? It seems so:&lt;/p&gt;
+
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_pt&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;enumerate&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span& [...]
+    &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;abs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;gr_only_compiled_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt; [...]
+&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
+
+&lt;p&gt;gives us a list of numbers in the 1e-5ish range.&lt;/p&gt;
+
+&lt;p&gt;But we wanted to get something running in PyTorch, right?&lt;/p&gt;
+
+&lt;p&gt;Keeping with how PyTorch works, we first define an &lt;code class=&quot;highlighter-rouge&quot;&gt;autograd.Function&lt;/code&gt; that does the things we just did manually:&lt;/p&gt;
+
+&lt;p&gt;In the &lt;code class=&quot;highlighter-rouge&quot;&gt;forward&lt;/code&gt;:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Generate the dropout random values,&lt;/li&gt;
+  &lt;li&gt;Run the forward,&lt;/li&gt;
+  &lt;li&gt;Record the captures, inputs, and dropout values needed for backward.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;In the &lt;code class=&quot;highlighter-rouge&quot;&gt;backward&lt;/code&gt;, run the backward and return the result (as PyTorch tensors).&lt;/p&gt;
+
+&lt;p&gt;With that, we get a PyTorch autograd.Function calling into TVM (we would want a small wrapper for that).&lt;/p&gt;
+
+&lt;p&gt;Now all we need to do to achieve our goal of getting a method &lt;code class=&quot;highlighter-rouge&quot;&gt;add_tvm_dispatch(module, sample_inputs)&lt;/code&gt; is
+to trace the module, create the TVM-based autograd function from it and then replace the forward that calls
+that (with the parameters) if applicable or falls back to the usual forward.
+Python’s unlimited dynamism makes that kind of hackery relatively easy.
+As all this is not really TVM-related, we are sparing ourselves the details here (but you could check the
+&lt;a href=&quot;https://lernapparat.de/transformers-pytorch-tvm/&quot;&gt;companion post&lt;/a&gt;).&lt;/p&gt;
+
+&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
+
+&lt;p&gt;As I said in the beginning, we aren’t quite where we want to eventually be in terms of performance.
+After tuning the tasks (and on the not very realistic inference example from the HuggingFace BERT + PyTorch JIT tutorial)
+we run 100 iterations of the TVM-enabled BertLayer forward and backward similar to how we did it for the inference.
+One iteration takes 6.2ms going through TVM versus 1.3ms on PyTorch.&lt;/p&gt;
+
+&lt;p&gt;So we ran our model through TVM all right. But it’s not as fast as the usual method yet. Here is the opportunity!&lt;/p&gt;
+
+&lt;p&gt;More seriously, we have two immediate paths to improve performance:&lt;/p&gt;
+
+&lt;ul&gt;
+  &lt;li&gt;Find a better set of captured nodes.&lt;/li&gt;
+  &lt;li&gt;Find optimizations on the TVM graph.&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;p&gt;In terms of heuristics for the former (remember that it is quite likely NP hard, i.e. I believe it is, but I didn’t work out a formal proof),
+one would want to re-do cheap computation, most prominently point-wise computation (or maybe anything but matmul?). But that is for another day.&lt;/p&gt;
+
+&lt;p&gt;I hope you enjoyed the tutorial, I look forward to your comments at &lt;a href=&quot;mailto:tv@lernapparat.de&quot;&gt;tv@lernapparat.de&lt;/a&gt;.&lt;/p&gt;
+
+&lt;h1 id=&quot;acknowledgements&quot;&gt;Acknowledgements&lt;/h1&gt;
+
+&lt;p&gt;I had many interesting discussions with HuggingFace people and Morgan Funtowicz in particular. Also the TVM contributors had many good comments during the review of the patches to TVM and on the forums. The creation of this tutorial was sponsored by AMD.&lt;/p&gt;
+
+&lt;h1 id=&quot;author&quot;&gt;Author&lt;/h1&gt;
+
+&lt;p&gt;&lt;a href=&quot;https://lernapparat.de/&quot;&gt;Thomas Viehmann&lt;/a&gt; is the founder of &lt;a href=&quot;https://mathinf.eu/&quot;&gt;MathInf GmbH&lt;/a&gt;, Munich, Germany, a boutique training and consultancy firm focusing on Machine Learning and PyTorch.
+He is a PyTorch core developer and co-authored &lt;a href=&quot;https://www.manning.com/books/deep-learning-with-pytorch&quot;&gt;Deep Learning with PyTorch&lt;/a&gt;, which is currently available as a &lt;a href=&quot;https://pytorch.org/deep-learning-with-pytorch&quot;&gt;free download from the PyTorch website&lt;/a&gt;.&lt;/p&gt;
+</content>
+ </entry>
+ 
+ <entry>
    <title>TinyML - How TVM is Taming Tiny</title>
    <link href="https://tvm.apache.org/2020/06/04/tinyml-how-tvm-is-taming-tiny"/>
    <updated>2020-06-04T00:00:00-07:00</updated>
@@ -37,32 +560,32 @@ A standard µTVM setup, where the host communicates with the device via JTAG.&lt
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;OPENOCD_SERVER_ADDR&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'127.0.0.1'&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;OPENOCD_SERVER_PORT&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;6666&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;TARGET&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'c -device=micro_dev'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;DEV_CONFIG&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stm32f746xx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;default_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;OPENOCD_SERVER_ADDR&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OPENOCD_SERVER_PORT&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;n&quot;&gt;TARGET&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'c -device=micro_dev'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;DEV_CONFIG&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stm32f746xx&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;default_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;OPENOCD_SERVER_ADDR&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OPENOCD_SERVER_PORT&lt;/span&gt;&lt;spa [...]
 
 &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_cifar10_cnn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
-&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Session&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;sess&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
-	&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt [...]
-  &lt;span class=&quot;n&quot;&gt;micro_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_micro_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DEV_CONFIG&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
-  &lt;span class=&quot;n&quot;&gt;graph_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graph_runtime&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt [...]
-  &lt;span class=&quot;n&quot;&gt;graph_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;run&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-  &lt;span class=&quot;n&quot;&gt;prediction&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CIFAR10_CLASSES&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;argmax&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;graph_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt; [...]
-  &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;f'prediction was &lt;/span&gt;&lt;span class=&quot;si&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;prediction&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;}&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Session&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;sess&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt [...]
+  &lt;span class=&quot;n&quot;&gt;micro_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_micro_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;DEV_CONFIG&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+  &lt;span class=&quot;n&quot;&gt;graph_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;graph_runtime&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt [...]
+  &lt;span class=&quot;n&quot;&gt;graph_mod&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;run&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+  &lt;span class=&quot;n&quot;&gt;prediction&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CIFAR10_CLASSES&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;argmax&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;graph_mod&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt; [...]
+  &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'prediction was {prediction}'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;Below are the performance results of MicroTVM, compared with &lt;a href=&quot;https://github.com/ARM-software/CMSIS_5/releases/tag/5.6.0&quot;&gt;CMSIS-NN version 5.7.0&lt;/a&gt; (commit &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;a65b7c9a&lt;/code&gt;), a hand-optimized library of ML kernels.&lt;/p&gt;
+&lt;p&gt;Below are the performance results of MicroTVM, compared with &lt;a href=&quot;https://github.com/ARM-software/CMSIS_5/releases/tag/5.6.0&quot;&gt;CMSIS-NN version 5.7.0&lt;/a&gt; (commit &lt;code class=&quot;highlighter-rouge&quot;&gt;a65b7c9a&lt;/code&gt;), a hand-optimized library of ML kernels.&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/microtvm/post-2020-05-28/cifar10-int-8-cnn.png&quot; alt=&quot;/images/microtvm/post-2020-05-28/cifar10-int-8-cnn.png&quot; width=&quot;60%&quot; /&gt;&lt;br /&gt;&lt;/p&gt;
 
 &lt;p&gt;As we can see, the out-of-the-box performance isn’t great, but this is where &lt;a href=&quot;https://dl.acm.org/doi/10.5555/3327144.3327258&quot;&gt;AutoTVM&lt;/a&gt; comes to the rescue.  We can write a schedule template for our device, do a round of autotuning, then achieve significantly better results.  To plug in our autotuned results, we only need to replace this line:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/s [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/s [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;with these lines:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;TARGET&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;autotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;apply_history_best&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
-  &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&l [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;TARGET&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;autotvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;apply_history_best&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
+  &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&l [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;And our results now look like this:&lt;/p&gt;
@@ -95,7 +618,7 @@ The µTVM Device Memory Layout in RAM&lt;/p&gt;
 &lt;/span&gt;         &lt;span class=&quot;s&quot;&gt;'text'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;18000&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
          &lt;span class=&quot;s&quot;&gt;'rodata'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;100&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
          &lt;span class=&quot;s&quot;&gt;'data'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;100&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
-         &lt;span class=&quot;p&quot;&gt;...&lt;/span&gt;
+         &lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;
     &lt;span class=&quot;p&quot;&gt;},&lt;/span&gt;
     &lt;span class=&quot;s&quot;&gt;'word_size'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;                        &lt;span class=&quot;c1&quot;&gt;# device word size
 &lt;/span&gt;    &lt;span class=&quot;s&quot;&gt;'thumb_mode'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;                    &lt;span class=&quot;c1&quot;&gt;# whether to use ARM's thumb ISA
@@ -131,16 +654,16 @@ The µTVM Device Memory Layout in RAM&lt;/p&gt;
 
 &lt;h2 id=&quot;device-sessions&quot;&gt;Device Sessions&lt;/h2&gt;
 
-&lt;p&gt;Given the networked nature of microcontroller interaction, we slightly deviate from standard TVM code by introducing the concept of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;MicroSession&lt;/code&gt;.&lt;/p&gt;
+&lt;p&gt;Given the networked nature of microcontroller interaction, we slightly deviate from standard TVM code by introducing the concept of &lt;code class=&quot;highlighter-rouge&quot;&gt;MicroSession&lt;/code&gt;.&lt;/p&gt;
 
 &lt;p&gt;Every piece of functionality in µTVM relies on having an open session with the target device.  If you’re familiar with TVM, you may have noticed a line of code that deviates from the norm in our first code snippet—-namely, this one:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;p&quot;&gt;...&lt;/span&gt;
-&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Session&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;sess&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
-	&lt;span class=&quot;p&quot;&gt;...&lt;/span&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;
+&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Session&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;sess&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
+	&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;Every line inside this &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;with&lt;/code&gt; block can call functions in µTVM, with the context being the device specified by &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;device_config&lt;/code&gt;.  This line is doing a number of things under the hood, so let’s unpack it.&lt;/p&gt;
+&lt;p&gt;Every line inside this &lt;code class=&quot;highlighter-rouge&quot;&gt;with&lt;/code&gt; block can call functions in µTVM, with the context being the device specified by &lt;code class=&quot;highlighter-rouge&quot;&gt;device_config&lt;/code&gt;.  This line is doing a number of things under the hood, so let’s unpack it.&lt;/p&gt;
 
 &lt;p&gt;First, it initializes a connection with your device, using whichever communication method you specified (usually OpenOCD).  The µTVM device runtime is then cross-compiled, using whichever cross-compiler you specified.  Finally, space for the compiled binary is allocated by the host, and the binary is loaded onto the device using the opened connection.&lt;/p&gt;
 
@@ -150,14 +673,14 @@ The µTVM Device Memory Layout in RAM&lt;/p&gt;
 
 &lt;p&gt;One of the core abstractions in TVM is that of a module.  A module stores a set of related functions for a particular device/runtime target.  Given that microcontrollers don’t normally have operating systems, µTVM needs to do a lot of extra work to maintain this high-level abstraction.  To see what’s going on, we’ll trace through the process of creating and loading a µTVM-compatible module.&lt;/p&gt;
 
-&lt;p&gt;Suppose we have a &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;micro.Session&lt;/code&gt; open with our device and a TVM schedule that implements 2D convolution.  If we want to load it onto our microcontroller, we need it to emit C code.  To do so, we just need to set the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;target&lt;/code&gt; in either &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;tvm.build&lt;/code&gt; o [...]
+&lt;p&gt;Suppose we have a &lt;code class=&quot;highlighter-rouge&quot;&gt;micro.Session&lt;/code&gt; open with our device and a TVM schedule that implements 2D convolution.  If we want to load it onto our microcontroller, we need it to emit C code.  To do so, we just need to set the &lt;code class=&quot;highlighter-rouge&quot;&gt;target&lt;/code&gt; in either &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.build&lt;/code&gt; or &lt;code class=&quot;highlighter-rouge&quot;&gt;relay.b [...]
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/s [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/s [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;By setting the target like so, the build process runs through our C code generation backend.  However, the resulting C module still resides on the host machine.  In order to load it onto the device, we run it through one of the core functions in the µTVM infrastructure: &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;create_micro_mod&lt;/code&gt;.  Example:&lt;/p&gt;
+&lt;p&gt;By setting the target like so, the build process runs through our C code generation backend.  However, the resulting C module still resides on the host machine.  In order to load it onto the device, we run it through one of the core functions in the µTVM infrastructure: &lt;code class=&quot;highlighter-rouge&quot;&gt;create_micro_mod&lt;/code&gt;.  Example:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;micro_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_micro_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c_ [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;micro_mod&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;micro&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_micro_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c_ [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;The line above cross-compiles the C source within the module, allocates room for the resulting binary (so it can coexist with the runtime in device memory), then sends each section of the binary to its allocated slot on the device.  Once the module binary is snug in device memory, function pointers within the binary are patched to give the module access to helper functions in the device runtime (e.g., for allocating scratchpads).&lt;/p&gt;
@@ -172,12 +695,12 @@ The µTVM Device Memory Layout in RAM&lt;/p&gt;
 &lt;p&gt;If we want to call an operator, we first need some tensors as arguments:&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;data_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_conv_inputs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;micro_dev&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
-&lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span  [...]
+&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;micro_dev&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span  [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;Based on its data type (e.g., &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt;, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;float32&lt;/code&gt;, etc.) and shape, each tensor’s size in bytes is calculated, and the host allocates a region of memory on the device’s heap.  The tensor’s data is then loaded into the allocated region.&lt;/p&gt;
+&lt;p&gt;Based on its data type (e.g., &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt;, &lt;code class=&quot;highlighter-rouge&quot;&gt;float32&lt;/code&gt;, etc.) and shape, each tensor’s size in bytes is calculated, and the host allocates a region of memory on the device’s heap.  The tensor’s data is then loaded into the allocated region.&lt;/p&gt;
 
 &lt;h2 id=&quot;function-calls&quot;&gt;Function Calls&lt;/h2&gt;
 
@@ -207,13 +730,13 @@ The µTVM Device Memory Layout in RAM&lt;/p&gt;
 &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;UTVMTask&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;In the strict setting, there is a single global &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;UTVMTask&lt;/code&gt; instance that we, from the host side, write into.  Once we have written to the task, the runtime has everything it needs to execute the function, and we can begin execution at the runtime’s entry point.  The runtime will perform some lightweight initialization, run our operator, then return control to the host.&lt;/p&gt;
+&lt;p&gt;In the strict setting, there is a single global &lt;code class=&quot;highlighter-rouge&quot;&gt;UTVMTask&lt;/code&gt; instance that we, from the host side, write into.  Once we have written to the task, the runtime has everything it needs to execute the function, and we can begin execution at the runtime’s entry point.  The runtime will perform some lightweight initialization, run our operator, then return control to the host.&lt;/p&gt;
 
 &lt;h3 id=&quot;lazy-execution&quot;&gt;Lazy Execution&lt;/h3&gt;
 
 &lt;p&gt;In practice, executing operators as soon as the user requests to becomes prohibitively expensive, as communication overhead begins to dominate.  We can improve the throughput of our system by delaying evaluation until the user wants the results of the call.&lt;/p&gt;
 
-&lt;p&gt;From an implementation standpoint, instead of eagerly serializing argument metadata and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;UTVMTask&lt;/code&gt; data, we now need to accumulate function call metadata on the host side, before flushing it to the device.  The device runtime also needs a few changes: (1) we must now have a global array of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;UTVMTask&lt;/code&gt; and (2) we need to loop t [...]
+&lt;p&gt;From an implementation standpoint, instead of eagerly serializing argument metadata and &lt;code class=&quot;highlighter-rouge&quot;&gt;UTVMTask&lt;/code&gt; data, we now need to accumulate function call metadata on the host side, before flushing it to the device.  The device runtime also needs a few changes: (1) we must now have a global array of &lt;code class=&quot;highlighter-rouge&quot;&gt;UTVMTask&lt;/code&gt; and (2) we need to loop through and execute each task in order. [...]
 
 &lt;h2 id=&quot;autotvm-with-microtvm&quot;&gt;AutoTVM with MicroTVM&lt;/h2&gt;
 
@@ -252,7 +775,7 @@ Diagram of CIFAR-10 CNN&lt;/p&gt;
 
 &lt;h2 id=&quot;methodology&quot;&gt;Methodology&lt;/h2&gt;
 
-&lt;p&gt;In our experiments, we use TVM from HEAD (commit &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;9fa8341&lt;/code&gt;), version 5.7.0 of CMSIS-NN (commit &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;a65b7c9a&lt;/code&gt;), version 1.16.0 of STM32CubeF7, and GCC from Arm’s GNU Tools for Arm Embedded Processors 9-2019-q4-major 9.2.1 toolchain (revision 277599).  The host machine used in our experiments runs Ubuntu Linux 18.04.4 LTS and spor [...]
+&lt;p&gt;In our experiments, we use TVM from HEAD (commit &lt;code class=&quot;highlighter-rouge&quot;&gt;9fa8341&lt;/code&gt;), version 5.7.0 of CMSIS-NN (commit &lt;code class=&quot;highlighter-rouge&quot;&gt;a65b7c9a&lt;/code&gt;), version 1.16.0 of STM32CubeF7, and GCC from Arm’s GNU Tools for Arm Embedded Processors 9-2019-q4-major 9.2.1 toolchain (revision 277599).  The host machine used in our experiments runs Ubuntu Linux 18.04.4 LTS and sports an AMD Ryzen Threadripper 2990WX 32 [...]
 
 &lt;h3 id=&quot;arm-specific-optimizations&quot;&gt;Arm-Specific Optimizations&lt;/h3&gt;
 
@@ -270,7 +793,7 @@ Diagram from CMSIS-NN paper showing a 2x2 matrix multiplication microkernel&lt;/
 &lt;p&gt;There are certainly other optimizations we could pull from CMSIS-NN to close the gap even further:&lt;/p&gt;
 
 &lt;ul&gt;
-  &lt;li&gt;Batch expansion of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt; weights into &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int16&lt;/code&gt;, to cut down on duplicate expansion for SIMD&lt;/li&gt;
+  &lt;li&gt;Batch expansion of &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt; weights into &lt;code class=&quot;highlighter-rouge&quot;&gt;int16&lt;/code&gt;, to cut down on duplicate expansion for SIMD&lt;/li&gt;
   &lt;li&gt;Splitting convolution into 3x3 tiles to reduce padding checks&lt;/li&gt;
 &lt;/ul&gt;
 
@@ -285,10 +808,10 @@ Diagram from CMSIS-NN paper showing a 2x2 matrix multiplication microkernel&lt;/
 &lt;p&gt;&lt;a href=&quot;https://github.com/areusch/microtvm-blogpost-eval&quot;&gt;https://github.com/areusch/microtvm-blogpost-eval&lt;/a&gt;&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn.png&quot; alt=&quot;/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn.png&quot; width=&quot;60%&quot; /&gt;&lt;br /&gt;
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt;-quantized CIFAR-10 CNN comparison on an Arm STM32F746NG (re-posted from above)&lt;/p&gt;
+&lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt;-quantized CIFAR-10 CNN comparison on an Arm STM32F746NG (re-posted from above)&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn-x86.png&quot; alt=&quot;/images/microtvm/post-2020-05-28/autotuned-cifar10-int-8-cnn-x86.png&quot; width=&quot;60%&quot; /&gt;&lt;br /&gt;
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt;-quantized CIFAR-10 CNN comparison on µTVM’s emulated host device&lt;/p&gt;
+&lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt;-quantized CIFAR-10 CNN comparison on µTVM’s emulated host device&lt;/p&gt;
 
 &lt;p&gt;On the Arm STM32-series board, we were able to improve performance by ~2x compared to the initial untuned operators, and we achieved results much closer to CMSIS-NN.  Additionally, we were able to significantly improve performance on the host emulated device.  Though the x86 &lt;strong&gt;&lt;em&gt;numbers&lt;/em&gt;&lt;/strong&gt; don’t mean much, they show we can use the same infrastructure (µTVM) to optimize performance on vastly different architectures.&lt;/p&gt;
 
@@ -331,7 +854,7 @@ Diagram from CMSIS-NN paper showing a 2x2 matrix multiplication microkernel&lt;/
 &lt;h2 id=&quot;introduction&quot;&gt;Introduction&lt;/h2&gt;
 
 &lt;p&gt;When designing accelerators, an important decision is how one will approximately represent real numbers in hardware.
-This problem has had a longstanding, industry-standard solution: the IEEE 754 floating-point standard.&lt;sup id=&quot;fnref:ieee&quot; role=&quot;doc-noteref&quot;&gt;&lt;a href=&quot;#fn:ieee&quot; class=&quot;footnote&quot;&gt;1&lt;/a&gt;&lt;/sup&gt;
+This problem has had a longstanding, industry-standard solution: the IEEE 754 floating-point standard.&lt;sup id=&quot;fnref:ieee&quot;&gt;&lt;a href=&quot;#fn:ieee&quot; class=&quot;footnote&quot;&gt;1&lt;/a&gt;&lt;/sup&gt;
 Yet,
   when trying to squeeze
   the most out of hardware
@@ -345,13 +868,13 @@ If we know the numerical requirements
   or more power efficient datatype?
 The answer is yes!
 Researchers have already begun experimenting with new datatypes in academic and industrial accelerator designs.
-For example, Google’s Tensor Processing Unit (the TPU) uses the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type: a single-precision IEEE float which has been truncated to 16 bits.
+For example, Google’s Tensor Processing Unit (the TPU) uses the &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type: a single-precision IEEE float which has been truncated to 16 bits.
 Due to the lax numerical requirements
   of many deep learning workloads,
   this truncation often has no effect
   on model accuracy,
   while instantly cutting the storage cost
-  in half.&lt;sup id=&quot;fnref:jouppi2017datacenter&quot; role=&quot;doc-noteref&quot;&gt;&lt;a href=&quot;#fn:jouppi2017datacenter&quot; class=&quot;footnote&quot;&gt;2&lt;/a&gt;&lt;/sup&gt;&lt;sup id=&quot;fnref:tensorflowbfloat&quot; role=&quot;doc-noteref&quot;&gt;&lt;a href=&quot;#fn:tensorflowbfloat&quot; class=&quot;footnote&quot;&gt;3&lt;/a&gt;&lt;/sup&gt;&lt;/p&gt;
+  in half.&lt;sup id=&quot;fnref:jouppi2017datacenter&quot;&gt;&lt;a href=&quot;#fn:jouppi2017datacenter&quot; class=&quot;footnote&quot;&gt;2&lt;/a&gt;&lt;/sup&gt;&lt;sup id=&quot;fnref:tensorflowbfloat&quot;&gt;&lt;a href=&quot;#fn:tensorflowbfloat&quot; class=&quot;footnote&quot;&gt;3&lt;/a&gt;&lt;/sup&gt;&lt;/p&gt;
 
 &lt;p&gt;Before researchers begin building hardware for their datatype, however, they first need to determine how their datatype will behave numerically in the workloads they care about.
 This often involves first building a software-emulated version of their datatype
@@ -381,8 +904,8 @@ Unlike the posits-in-Tensorflow example above, which enables a single new dataty
   using custom datatypes.
 In the Bring Your Own Datatypes framework,
   “datatype” means a scalar type:
-  &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;float32&lt;/code&gt;
-  or &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;uint8&lt;/code&gt;, for example.
+  &lt;code class=&quot;highlighter-rouge&quot;&gt;float32&lt;/code&gt;
+  or &lt;code class=&quot;highlighter-rouge&quot;&gt;uint8&lt;/code&gt;, for example.
 We do not handle more complicated data formats
   such as &lt;a href=&quot;https://en.wikipedia.org/wiki/Block_floating_point&quot; target=&quot;_blank&quot;&gt;block floating point&lt;/a&gt;
   or Intel’s &lt;a href=&quot;https://arxiv.org/abs/1711.02213&quot; target=&quot;_blank&quot;&gt;Flexpoint&lt;/a&gt;.
@@ -398,7 +921,7 @@ Additionally,
 A number of these type codes
   have hard-coded meanings in TVM,
   mapping to common datatypes
-  such as &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;float&lt;/code&gt;.
+  such as &lt;code class=&quot;highlighter-rouge&quot;&gt;int&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;float&lt;/code&gt;.
 However,
   the vast majority of type codes
   are unused.
@@ -429,21 +952,21 @@ These steps are akin to
   where the type code comes from
   the range of unused type codes
   available to custom datatypes.&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;datatype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;register&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'bfloat'&lt;/sp [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;datatype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;register&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'bfloat'&lt;/sp [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;The above code registers
-  the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;'bfloat'&lt;/code&gt; datatype
+  the &lt;code class=&quot;highlighter-rouge&quot;&gt;'bfloat'&lt;/code&gt; datatype
   with type code 150.
 This registration step
   allows TVM to parse programs
   which use the custom type:&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'x'&lt;/span&gt;&lt;spa [...]
-&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'y'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&q [...]
-&lt;span class=&quot;n&quot;&gt;x_bfloat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cast&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
-&lt;span class=&quot;n&quot;&gt;y_bfloat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cast&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'x'&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'y'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&q [...]
+&lt;span class=&quot;n&quot;&gt;x_bfloat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cast&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;y_bfloat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cast&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
 &lt;span class=&quot;n&quot;&gt;z_bfloat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x_bfloat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_bfloat&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cast&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z_bfloat&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
-&lt;span class=&quot;n&quot;&gt;program&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Function&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span  [...]
+&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cast&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z_bfloat&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;program&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Function&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span  [...]
 &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;program&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# v0.0.4
@@ -455,18 +978,18 @@ This registration step
 # }
 &lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;The program above
-  casts &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;float32&lt;/code&gt; inputs &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;x&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;y&lt;/code&gt;
-  into &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;s,
+  casts &lt;code class=&quot;highlighter-rouge&quot;&gt;float32&lt;/code&gt; inputs &lt;code class=&quot;highlighter-rouge&quot;&gt;x&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;y&lt;/code&gt;
+  into &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;s,
   adds them,
-  and casts the result back to &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;float32&lt;/code&gt;.
-Once the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type is registered,
-  TVM is able to parse the special &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dtype&lt;/code&gt; syntax
-  &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;custom[&amp;lt;typename&amp;gt;]&lt;/code&gt;,
-  where &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;&amp;lt;typename&amp;gt;&lt;/code&gt; is the name registered for the type.
+  and casts the result back to &lt;code class=&quot;highlighter-rouge&quot;&gt;float32&lt;/code&gt;.
+Once the &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type is registered,
+  TVM is able to parse the special &lt;code class=&quot;highlighter-rouge&quot;&gt;dtype&lt;/code&gt; syntax
+  &lt;code class=&quot;highlighter-rouge&quot;&gt;custom[&amp;lt;typename&amp;gt;]&lt;/code&gt;,
+  where &lt;code class=&quot;highlighter-rouge&quot;&gt;&amp;lt;typename&amp;gt;&lt;/code&gt; is the name registered for the type.
 This syntax also supports the usual
-  &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;&amp;lt;bits&amp;gt;x&amp;lt;lanes&amp;gt;&lt;/code&gt; format;
-  here, we use &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;16&lt;/code&gt; to indicate that
-  each &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; is 16 bits wide.
+  &lt;code class=&quot;highlighter-rouge&quot;&gt;&amp;lt;bits&amp;gt;x&amp;lt;lanes&amp;gt;&lt;/code&gt; format;
+  here, we use &lt;code class=&quot;highlighter-rouge&quot;&gt;16&lt;/code&gt; to indicate that
+  each &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; is 16 bits wide.
 (The number of lanes
   defaults to 1.)&lt;/p&gt;
 
@@ -476,7 +999,7 @@ This syntax also supports the usual
   it cannot yet compile it,
   as TVM does not yet understand 
   how to compile operations 
-  over the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type.
+  over the &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type.
 To compile these programs,
   we register &lt;em&gt;lowering functions&lt;/em&gt; for the custom datatype,
   which help TVM convert the operations
@@ -499,22 +1022,22 @@ Figure 1: The expected result of a user's registered lowering function. A loweri
 
 &lt;p&gt;Figure 1 shows a common pattern.
 Let’s assume we are
-  interested in exploring the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type,
+  interested in exploring the &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; type,
   and have chosen to run some workloads
-  by plugging a &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; emulation library (e.g. &lt;a href=&quot;https://github.com/biovault/biovault_bfloat16&quot; target=&quot;_blank&quot;&gt;biovault_bfloat16&lt;/a&gt;) into TVM
+  by plugging a &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; emulation library (e.g. &lt;a href=&quot;https://github.com/biovault/biovault_bfloat16&quot; target=&quot;_blank&quot;&gt;biovault_bfloat16&lt;/a&gt;) into TVM
   via the Bring Your Own Datatypes framework.
 Our workload is a simple program
-  which adds two &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; inputs.
+  which adds two &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; inputs.
 Native TVM does not understand
-  how to implement &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; addition—but it doesn’t need to,
+  how to implement &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; addition—but it doesn’t need to,
   as we have a library implementing our datatype!
-The library contains an implementation of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; addition,
+The library contains an implementation of &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; addition,
   alongside other operators such as multiplication and square root.
-To implement this &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; addition,
+To implement this &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt; addition,
   we’d just like to call into our library.
 Thus, our Add node should become a Call node,
-  calling out to a function (call it &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;BFloat16Add&lt;/code&gt;) in our library.
-To store the bits of the input &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;s
+  calling out to a function (call it &lt;code class=&quot;highlighter-rouge&quot;&gt;BFloat16Add&lt;/code&gt;) in our library.
+To store the bits of the input &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;s
   inside a type that TVM understands,
   we use 16-bit unsigned integers.
 The resulting program 
@@ -524,16 +1047,16 @@ The resulting program
 
 &lt;p&gt;To achieve the above lowering,
   we register a lowering function
-  for &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;:&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;datatype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;register_op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;datatype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_lower_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'BFloat16Add'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+  for &lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;:&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;datatype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;register_op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;datatype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_lower_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'BFloat16Add'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
     &lt;span class=&quot;s&quot;&gt;'Add'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'llvm'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'bfloat'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;The above code registers
   a lowering function
   for a specific operator (Add),
   compilation target (LLVM),
-  and datatype (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;).
+  and datatype (&lt;code class=&quot;highlighter-rouge&quot;&gt;bfloat&lt;/code&gt;).
 The first argument
   is the lowering function.
 This can be any function
@@ -542,15 +1065,15 @@ This can be any function
 In our case,
   we use a helper function
   provided by the Bring Your Own Datatypes framework.
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;tvm.datatype.create_lower_func('BFloat16Add')&lt;/code&gt;
+&lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.datatype.create_lower_func('BFloat16Add')&lt;/code&gt;
   creates a lowering function
   for the common pattern described above.
 The resulting function
   converts the arguments of the given node
-  to &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;uint16_t&lt;/code&gt;,
+  to &lt;code class=&quot;highlighter-rouge&quot;&gt;uint16_t&lt;/code&gt;,
   and then converts the node itself
   into a call to the given function name
-  (in this case, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;'BFloat16Add'&lt;/code&gt;).&lt;/p&gt;
+  (in this case, &lt;code class=&quot;highlighter-rouge&quot;&gt;'BFloat16Add'&lt;/code&gt;).&lt;/p&gt;
 
 &lt;p&gt;To implement a custom datatype,
   the user will need to register
@@ -591,16 +1114,16 @@ The Bring Your Own Datatypes framework
 
 &lt;h2 id=&quot;references&quot;&gt;References&lt;/h2&gt;
 
-&lt;div class=&quot;footnotes&quot; role=&quot;doc-endnotes&quot;&gt;
+&lt;div class=&quot;footnotes&quot;&gt;
   &lt;ol&gt;
-    &lt;li id=&quot;fn:ieee&quot; role=&quot;doc-endnote&quot;&gt;
-      &lt;p&gt;&lt;a href=&quot;https://standards.ieee.org/standard/754-2019.html&quot; target=&quot;_blank&quot;&gt;754-2019 - IEEE Standard for Floating-Point Arithmetic&lt;/a&gt; &lt;a href=&quot;#fnref:ieee&quot; class=&quot;reversefootnote&quot; role=&quot;doc-backlink&quot;&gt;&amp;#8617;&lt;/a&gt;&lt;/p&gt;
+    &lt;li id=&quot;fn:ieee&quot;&gt;
+      &lt;p&gt;&lt;a href=&quot;https://standards.ieee.org/standard/754-2019.html&quot; target=&quot;_blank&quot;&gt;754-2019 - IEEE Standard for Floating-Point Arithmetic&lt;/a&gt; &lt;a href=&quot;#fnref:ieee&quot; class=&quot;reversefootnote&quot;&gt;&amp;#8617;&lt;/a&gt;&lt;/p&gt;
     &lt;/li&gt;
-    &lt;li id=&quot;fn:jouppi2017datacenter&quot; role=&quot;doc-endnote&quot;&gt;
-      &lt;p&gt;Jouppi, Norman P., et al. “In-datacenter performance analysis of a tensor processing unit.” Proceedings of the 44th Annual International Symposium on Computer Architecture. 2017. &lt;a href=&quot;#fnref:jouppi2017datacenter&quot; class=&quot;reversefootnote&quot; role=&quot;doc-backlink&quot;&gt;&amp;#8617;&lt;/a&gt;&lt;/p&gt;
+    &lt;li id=&quot;fn:jouppi2017datacenter&quot;&gt;
+      &lt;p&gt;Jouppi, Norman P., et al. “In-datacenter performance analysis of a tensor processing unit.” Proceedings of the 44th Annual International Symposium on Computer Architecture. 2017. &lt;a href=&quot;#fnref:jouppi2017datacenter&quot; class=&quot;reversefootnote&quot;&gt;&amp;#8617;&lt;/a&gt;&lt;/p&gt;
     &lt;/li&gt;
-    &lt;li id=&quot;fn:tensorflowbfloat&quot; role=&quot;doc-endnote&quot;&gt;
-      &lt;p&gt;&lt;a href=&quot;https://cloud.google.com/tpu/docs/bfloat16&quot; target=&quot;_blank&quot;&gt;Using bfloat16 with TensorFlow models&lt;/a&gt; &lt;a href=&quot;#fnref:tensorflowbfloat&quot; class=&quot;reversefootnote&quot; role=&quot;doc-backlink&quot;&gt;&amp;#8617;&lt;/a&gt;&lt;/p&gt;
+    &lt;li id=&quot;fn:tensorflowbfloat&quot;&gt;
+      &lt;p&gt;&lt;a href=&quot;https://cloud.google.com/tpu/docs/bfloat16&quot; target=&quot;_blank&quot;&gt;Using bfloat16 with TensorFlow models&lt;/a&gt; &lt;a href=&quot;#fnref:tensorflowbfloat&quot; class=&quot;reversefootnote&quot;&gt;&amp;#8617;&lt;/a&gt;&lt;/p&gt;
     &lt;/li&gt;
   &lt;/ol&gt;
 &lt;/div&gt;
@@ -706,7 +1229,7 @@ To that end, PyTorch now has an official TVM-based backend, &lt;a href=&quot;htt
 
 &lt;p&gt;Usage is simple:&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
 torch_tvm.enable()
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
@@ -724,11 +1247,11 @@ torch_tvm.enable()
 
 &lt;p&gt;To support Relay, two features were added to the PyTorch JIT: custom transformation passes and custom subgraph interpreters.&lt;/p&gt;
 
-&lt;p&gt;When &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; is enabled, subgraphs of PyTorch IR that can be converted to Relay &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;Expr&lt;/code&gt;s will be marked as Relay-compatible.  Since PyTorch IR does not always contain shape information, none of the subgraphs can be compiled in a useful way before invocation.&lt;/p&gt;
+&lt;p&gt;When &lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; is enabled, subgraphs of PyTorch IR that can be converted to Relay &lt;code class=&quot;highlighter-rouge&quot;&gt;Expr&lt;/code&gt;s will be marked as Relay-compatible.  Since PyTorch IR does not always contain shape information, none of the subgraphs can be compiled in a useful way before invocation.&lt;/p&gt;
 
 &lt;p&gt;During user invocation, the PyTorch JIT runtime will determine input shape information and compile the previously marked subgraphs with the new Relay C++ &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/torch_tvm/compiler.cpp#L226-L246&quot;&gt;build system&lt;/a&gt;.  The compilation is cached based on input shapes for subsequent runs.  More details can be found in the &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/README.md&quot;&gt;README&lt;/a&gt;.&lt;/p&gt;
 
-&lt;p&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; has a continuous benchmark system set up, which is monitoring the performance of ResNet18 on CPU.
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;torch_tvm&lt;/code&gt; has a continuous benchmark system set up, which is monitoring the performance of ResNet18 on CPU.
 Out of the box TVM provides over two times the performance of the default PyTorch JIT backend for various ResNet models.
 Below is a graph that details the iterations per second achieved with 16 threads on an AWS c5n.4xlarge instance (larger is better):&lt;/p&gt;
 
@@ -744,9 +1267,9 @@ Below is a graph that details the iterations per second achieved with 16 threads
 
 &lt;h3 id=&quot;tutorial&quot;&gt;Tutorial&lt;/h3&gt;
 
-&lt;p&gt;If you have an already written PyTorch model, the easiest way to get started comes from using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;torch.jit.trace&lt;/code&gt; as follows&lt;/p&gt;
+&lt;p&gt;If you have an already written PyTorch model, the easiest way to get started comes from using &lt;code class=&quot;highlighter-rouge&quot;&gt;torch.jit.trace&lt;/code&gt; as follows&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;import torch_tvm
 from your_model import model, inputs
 
 torch_tvm.enable(opt_level=3)
@@ -774,12 +1297,12 @@ with torch.no_grad():
     print(&quot;Took {}s to run {} iters&quot;.format(tvm_time, iters))
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;Much of this code comes from &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/test/benchmarks.py&quot;&gt;benchmarks.py&lt;/a&gt;.  Note that tuned parameters for AVX2 LLVM compilation is in the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;test/&lt;/code&gt; folder of the repo.&lt;/p&gt;
+&lt;p&gt;Much of this code comes from &lt;a href=&quot;https://github.com/pytorch/tvm/blob/master/test/benchmarks.py&quot;&gt;benchmarks.py&lt;/a&gt;.  Note that tuned parameters for AVX2 LLVM compilation is in the &lt;code class=&quot;highlighter-rouge&quot;&gt;test/&lt;/code&gt; folder of the repo.&lt;/p&gt;
 
 &lt;p&gt;If you are more comfortable using Relay directly, it is possible to simply extract the expression directly from a
 PyTorch function either via (implicit) tracing or TorchScript:&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;def add(a, b, c):
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;def add(a, b, c):
     return a + b + c
 
 # via tracing
@@ -804,7 +1327,7 @@ relay_graph = torch_tvm.to_relay(mul, inputs)
    <content type="html">&lt;p&gt;Deep learning has been successfully applied to a variety of tasks.
 On real-time scenarios such as inference on autonomous vehicles, the inference speed of the model is critical.
 Network quantization is an effective approach to accelerating deep learning models.
-In quantized models, both data and model parameters are represented with low precision data types such as &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;float16&lt;/code&gt;.
+In quantized models, both data and model parameters are represented with low precision data types such as &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;float16&lt;/code&gt;.
 The lowered data bandwidth reduces the inference time and memory/storage requirements, as well as the power consumption.
 Meanwhile, under proper quantization schemes, we can minimize the accuracy drops of the quantized models.
 Therefore, quantized models are of particular interests of researchers and developers as it makes large models suitable to deploy on diverse devices, such as GPU, CPU and mobile devices.&lt;/p&gt;
@@ -826,38 +1349,38 @@ In emerging models such as ResNeXt and Deformable ConvNets, the automatic optimi
 
 &lt;h1 id=&quot;expressing-quantized-cuda-kernels-in-tvm&quot;&gt;Expressing Quantized CUDA Kernels in TVM&lt;/h1&gt;
 &lt;h2 id=&quot;leveraging-tensor-intrinsics-via-tensorization&quot;&gt;Leveraging Tensor Intrinsics via Tensorization&lt;/h2&gt;
-&lt;p&gt;Many platforms provide architecture-specific instructions for special computation patterns, for example, the SIMD instructions on x86, and the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;hfma&lt;/code&gt; instructions on CUDA.
+&lt;p&gt;Many platforms provide architecture-specific instructions for special computation patterns, for example, the SIMD instructions on x86, and the &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;hfma&lt;/code&gt; instructions on CUDA.
 These intrinsic instructions are highly optimized for specific devices.
 By leveraging hardware intrinsics, we can achieve a significant performance boost for quantized operators.&lt;/p&gt;
 
 &lt;p&gt;Currently, &lt;a href=&quot;https://devblogs.nvidia.com/mixed-precision-programming-cuda-8/&quot;&gt;dp4a&lt;/a&gt; has been extensively used in TVM int8 operators on CUDA.
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; is a CUDA intrinsic on Compute Capability 6.1 devices.
+&lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; is a CUDA intrinsic on Compute Capability 6.1 devices.
 It is a mixed-precision instruction that provides the efficient computation of the dot product between two 4-element 8-bit integer vectors and accumulates the result in 32-bit format.
-Using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, we can implement a dot product between 8-bit integer vectors with number of elements evenly divisible by four.
+Using &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, we can implement a dot product between 8-bit integer vectors with number of elements evenly divisible by four.
 With an efficient dot product operator, we can implement high-level operators such as 2d convolution and dense layers as these operators are commonly backed by dot products.&lt;/p&gt;
 
 &lt;p&gt;To illustrate, in 2d convolution we accumulate along the channel, the width, and the height axis of the kernel.
-This is a typical use case of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;.
+This is a typical use case of &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;.
 TVM uses tensorization to support calling external intrinsics.
-We do not need to modify the original computation declaration; we use the schedule primitive &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;tensorize&lt;/code&gt; to replace the accumulation with &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; tensor intrinsic.
+We do not need to modify the original computation declaration; we use the schedule primitive &lt;code class=&quot;highlighter-rouge&quot;&gt;tensorize&lt;/code&gt; to replace the accumulation with &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; tensor intrinsic.
 More details of tensorization can be found in the &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/language/tensorize.html&quot;&gt;tutorial&lt;/a&gt;.&lt;/p&gt;
 
 &lt;h2 id=&quot;data-layout-rearrangement&quot;&gt;Data Layout Rearrangement&lt;/h2&gt;
 &lt;p&gt;One of the challenges in tensorization is that we may need to design special computation logic to adapt to the requirement of tensor intrinsics.
-Although it is natural to accumulate along the inner axis of the tensor in the dense operator, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; can be more challenging.
-In &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; we expect to take a slice in the channel dimension as the input of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; because the number of channels is typically multiple of 4 (otherwise we fall back to original &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; in NCHW layout).
+Although it is natural to accumulate along the inner axis of the tensor in the dense operator, &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; can be more challenging.
+In &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; we expect to take a slice in the channel dimension as the input of &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt; because the number of channels is typically multiple of 4 (otherwise we fall back to original &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; in NCHW layout).
 Meanwhile, to achieve memory locality, we would like to reduce along the innermost axis first.
 Taking these factors into account, we use a custom data layout to address this challenge.&lt;/p&gt;
 
-&lt;p&gt;In CUDA int8 2d convolution, we empirically choose &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;NCHW4c&lt;/code&gt; as data layout and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;OIHW4o4i&lt;/code&gt; as weight layout.
-The templates can also be easily generalized to &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;NCHW[x]c&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;OIHW[x]o[x]i&lt;/code&gt;, where x is an arbitrary positive integer divisible by four.
+&lt;p&gt;In CUDA int8 2d convolution, we empirically choose &lt;code class=&quot;highlighter-rouge&quot;&gt;NCHW4c&lt;/code&gt; as data layout and &lt;code class=&quot;highlighter-rouge&quot;&gt;OIHW4o4i&lt;/code&gt; as weight layout.
+The templates can also be easily generalized to &lt;code class=&quot;highlighter-rouge&quot;&gt;NCHW[x]c&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;OIHW[x]o[x]i&lt;/code&gt;, where x is an arbitrary positive integer divisible by four.
 In the data layout we choose, slices of channels are in the packed innermost dimension.
 Likewise, we pack slices in both the input and output channel dimensions of the weight so that the output has a consistent data layout with the input, which prevents redundant layout transformations between layers.&lt;/p&gt;
 
 &lt;p&gt;We show the computation of one element of the output of the 2d convolution in Figure 2.
 The element in each position of the super dimension (the outer dimension of the blocked layout which contains packed elements) NCHW and OIHW is the packed input and kernel, respectively.
 Each column of the packed kernel comes from a different filter.
-We calculate the dot product between the packed input and each row in the packed kernel using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, and accumulate the result to the output tensor.&lt;/p&gt;
+We calculate the dot product between the packed input and each row in the packed kernel using &lt;code class=&quot;highlighter-rouge&quot;&gt;dp4a&lt;/code&gt;, and accumulate the result to the output tensor.&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/cuda-quantized/conv2d.png&quot; alt=&quot;image&quot; width=&quot;60%&quot; /&gt;&lt;/p&gt;
 &lt;div&gt;
@@ -868,7 +1391,7 @@ Figure 2. 2D convolution with data layout in NCHW4c and weight layout in OIHW4o4
 &lt;/div&gt;
 &lt;p&gt;&lt;/p&gt;
 
-&lt;p&gt;After we have specified the layout of convolution layers, other operators such as &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;add&lt;/code&gt; and activations can automatically adapt to the chosen layout during the &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/src/relay/pass/alter_op_layout.cc&quot;&gt;AlterOpLayout&lt;/a&gt; pass in Relay.
+&lt;p&gt;After we have specified the layout of convolution layers, other operators such as &lt;code class=&quot;highlighter-rouge&quot;&gt;add&lt;/code&gt; and activations can automatically adapt to the chosen layout during the &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/src/relay/pass/alter_op_layout.cc&quot;&gt;AlterOpLayout&lt;/a&gt; pass in Relay.
 The layout transformation of the weight can be precomputed offline. Therefore, we can run the whole model in the same layout without extra overhead.&lt;/p&gt;
 
 &lt;h2 id=&quot;designing-search-space-for-automatic-optimization&quot;&gt;Designing Search Space for Automatic Optimization&lt;/h2&gt;
@@ -880,8 +1403,8 @@ For example, as caching data in the shared memory is a common practice in CUDA p
 We also do some manual tiling such as splitting axes by 4 or 16 to facilitate vectorized memory access.&lt;/p&gt;
 
 &lt;p&gt;In quantized 2d convolution, we design a search space that includes a set of tunable options, such as the tile size, the axes to fuse, configurations of loop unrolling and double buffering.
-The templates of quantized &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dense&lt;/code&gt; on CUDA are registered under template key &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt;.
-During automatic tuning, we can create tuning tasks for these quantized operators by setting the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;template_key&lt;/code&gt; argument.
+The templates of quantized &lt;code class=&quot;highlighter-rouge&quot;&gt;conv2d&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;dense&lt;/code&gt; on CUDA are registered under template key &lt;code class=&quot;highlighter-rouge&quot;&gt;int8&lt;/code&gt;.
+During automatic tuning, we can create tuning tasks for these quantized operators by setting the &lt;code class=&quot;highlighter-rouge&quot;&gt;template_key&lt;/code&gt; argument.
 Details of how to launch automatic optimization can be found in the &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/autotvm/tune_relay_cuda.html&quot;&gt;AutoTVM tutorial&lt;/a&gt;.&lt;/p&gt;
 
 &lt;h1 id=&quot;general-workflow&quot;&gt;General Workflow&lt;/h1&gt;
@@ -892,22 +1415,22 @@ Details of how to launch automatic optimization can be found in the &lt;a href=&
 
 &lt;p&gt;TVM provides an easy workflow to quantize trained models from other frameworks, automatically optimize operators (with AutoTVM), and deploy to different devices.&lt;/p&gt;
 
-&lt;p&gt;First, we use the Relay frontend to import existing models. Here we use an MXNet model with &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;(1, 3, 224, 224)&lt;/code&gt; input shape as an example.&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg_params&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;aux_params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&l [...]
-&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
+&lt;p&gt;First, we use the Relay frontend to import existing models. Here we use an MXNet model with &lt;code class=&quot;highlighter-rouge&quot;&gt;(1, 3, 224, 224)&lt;/code&gt; input shape as an example.&lt;/p&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg_params&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;aux_params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&l [...]
+&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;sym&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;sp [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;Next, we use the relay quantization API to convert it to a quantized model.&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/spa [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;quantize&lt;/spa [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;Then, we use AutoTVM to extract tuning tasks for the operators in the model and perform automatic optimization. The &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/autotvm/tune_relay_cuda.html&quot;&gt;AutoTVM tutorial&lt;/a&gt; provides an example for this.&lt;/p&gt;
 
 &lt;p&gt;Finally, we build the model and run inference in the quantized mode.&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;opt_level&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt [...]
-    &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;s [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;with&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build_config&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;opt_level&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt [...]
+    &lt;span class=&quot;n&quot;&gt;graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;s [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;p&gt;The result of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;relay.build&lt;/code&gt; is a deployable library.
+&lt;p&gt;The result of &lt;code class=&quot;highlighter-rouge&quot;&gt;relay.build&lt;/code&gt; is a deployable library.
 We can either run inference &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/frontend/from_mxnet.html#execute-the-portable-graph-on-tvm&quot;&gt;on the GPU&lt;/a&gt; directly or deploy &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/frontend/deploy_model_on_rasp.html#deploy-the-model-remotely-by-rpc&quot;&gt;on the remote devices&lt;/a&gt; via RPC.&lt;/p&gt;
 
 &lt;h1 id=&quot;benchmark&quot;&gt;Benchmark&lt;/h1&gt;
@@ -985,12 +1508,12 @@ integrates the TVM runtime can load these compiled modules and perform inference
 import and compilation using TVM can be found at &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/&quot;&gt;tutorials&lt;/a&gt;.&lt;/p&gt;
 
 &lt;p&gt;TVM now supports deploying compiled modules through Golang. Golang applications can make use of this
-to deploy the deep learning models through TVM. The scope of this blog is the introduction of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package,
-the package build process and a sample application using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; to load a compiled module and perform inference.&lt;/p&gt;
+to deploy the deep learning models through TVM. The scope of this blog is the introduction of &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package,
+the package build process and a sample application using &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; to load a compiled module and perform inference.&lt;/p&gt;
 
 &lt;h2 id=&quot;package&quot;&gt;Package&lt;/h2&gt;
 
-&lt;p&gt;The golang package &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; is built on top of TVM’s C runtime interface. The API in this package
+&lt;p&gt;The golang package &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; is built on top of TVM’s C runtime interface. The API in this package
 abstracts the native C types and provides Golang compatible types. The package source can be found
 at &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/golang&quot;&gt;gotvm&lt;/a&gt;.&lt;/p&gt;
 
@@ -1003,10 +1526,10 @@ necessary conversions across API calls.&lt;/p&gt;
 
 &lt;h2 id=&quot;how-to&quot;&gt;How to&lt;/h2&gt;
 
-&lt;p&gt;As shown in the below diagram &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; enables golang applications to integrate deep learning models
+&lt;p&gt;As shown in the below diagram &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; enables golang applications to integrate deep learning models
 from various frameworks without the hassle of understanding each framework related interface API.
 Developers can make use of TVM to import and compile deep learning models and generate TVM artifacts.
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides golang friendly API to load, configure, feed input and get output.&lt;/p&gt;
+&lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides golang friendly API to load, configure, feed input and get output.&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/golang/TVM-Golang-Flow.png&quot; alt=&quot;image&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 &lt;center&gt; Import, Compile, Integrate and Deploy&lt;/center&gt;
@@ -1018,8 +1541,8 @@ generates the artifacts required to integrate and deploy the model on a target.&
 
 &lt;h2 id=&quot;api&quot;&gt;API&lt;/h2&gt;
 
-&lt;p&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides a handful of datatypes and API functions to initialize, load and infer
-from a golang application. Like any other golang package we just need to import &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package here.&lt;/p&gt;
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package provides a handful of datatypes and API functions to initialize, load and infer
+from a golang application. Like any other golang package we just need to import &lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; package here.&lt;/p&gt;
 
 &lt;ul&gt;
   &lt;li&gt;Module : The Module API can be used to load a TVM compiled module into TVM runtime and access any functions.&lt;/li&gt;
@@ -1108,7 +1631,7 @@ For simplicity the error handling is ignored here, but is important in real appl
 &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; extends the TVM packed function system to support golang function closures as packed functions.
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;gotvm&lt;/code&gt; extends the TVM packed function system to support golang function closures as packed functions.
 &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/golang/sample&quot;&gt;Examples&lt;/a&gt; available to register golang
 closure as TVM packed function and invoke the same across programming language barriers.&lt;/p&gt;
 
@@ -1208,8 +1731,8 @@ Finally, the output is computed in an unpacked format and in higher precision.&l
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;Input_bitpacked&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bitpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;acti [...]
 &lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bitpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pack_axis&lt;/span&gt;&lt;span class=&quot;o&quot;& [...]
-&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span& [...]
-&lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt [...]
+&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt [...]
 
 &lt;span class=&quot;n&quot;&gt;stride_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_left&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_down&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_right&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_pad_tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;( [...]
@@ -1223,17 +1746,17 @@ Finally, the output is computed in an unpacked format and in higher precision.&l
 &lt;span class=&quot;n&quot;&gt;Input_padded&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Input_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_before&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_after&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
 
 &lt;span class=&quot;c1&quot;&gt;# Treat the bitplane axes like additional reduction axes
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&l [...]
-&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
-&lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
-&lt;span class=&quot;n&quot;&gt;ib&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt [...]
-&lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &l [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel_q&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&l [...]
+&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
+&lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;s [...]
+&lt;span class=&quot;n&quot;&gt;ib&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt [...]
+&lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight_bits&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &l [...]
 
 
-&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; [...]
-             &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;popcount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; [...]
+             &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;popcount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
                &lt;span class=&quot;n&quot;&gt;Input_padded&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;stride_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt; [...]
-               &lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ff&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
+               &lt;span class=&quot;n&quot;&gt;Weights_bitpacked&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ff&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
                &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ry&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wb&lt;/span&gt;&lt;spa [...]
 
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
@@ -1469,9 +1992,9 @@ Links to tutorials for ARM CPU, Mali GPU, NVIDIA GPU, AMD GPU are all available
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;mxnet.gluon.model_zoo.vision&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_model&lt;/span&gt;
 
 &lt;span class=&quot;n&quot;&gt;block&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;get_model&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'resnet18_v1'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pretrained&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/ [...]
-&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nnvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;frontend&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;params&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nnvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;frontend&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt [...]
 
-&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;autotvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;extract_from_graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;autotvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;extract_from_graph&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;net&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;tune_tasks&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tasks&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tuning_option&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;There are 12 different conv2d layers in resnet-18, so we launch 12 tuning tasks.
@@ -1479,7 +2002,7 @@ For each of them, the tuner makes several hundreds of trials and picks the best
 After finishing all tuning tasks, we compile the whole network and generate a single deployable minimal library.
 One sample output is&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Extract tasks...
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Extract tasks...
 Tuning...
 [Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
 [Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
@@ -1873,21 +2396,21 @@ Figure 1&lt;/p&gt;
 
 &lt;p&gt;First, we compute a reference output in PyTorch:&lt;/p&gt;
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;We then define and build a TVM matrix multiplication operator, using the default
 schedule:&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;convert&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;& [...]
-    &lt;span class=&quot;n&quot;&gt;X&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
-    &lt;span class=&quot;n&quot;&gt;Y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
-
-    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
-    &lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class= [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&q [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;convert&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;& [...]
+    &lt;span class=&quot;n&quot;&gt;X&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;Y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+
+    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class= [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&q [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;For brevity, we do not cover TVM’s large collection of scheduling primitives
 that we can use to optimize matrix multiplication. If you wish to make a custom
@@ -1899,37 +2422,37 @@ found &lt;a href=&quot;https://tvm.apache.org/docs//tutorials/optimize/opt_gemm.
     &lt;span class=&quot;c1&quot;&gt;# fmm is the previously built TVM function (Python function)
 &lt;/span&gt;    &lt;span class=&quot;c1&quot;&gt;# fmm is the wrapped TVM function (Python function)
 &lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;fmm_pytorch&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_pytorch_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fmm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;56&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;fmm_pytorch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;  [...]
+    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;  [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;and verify that the results match.&lt;/p&gt;
 
 &lt;p&gt;We can repeat the same example, but using MxNet instead:&lt;/p&gt;
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;mxnet&lt;/span&gt;
     &lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib.mxnet&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_mxnet_func&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cpu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
-    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
-    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span clas [...]
-    &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cpu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+    &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;mxnet&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span clas [...]
+    &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;X&lt;/span&gt;&lt;span class=&quo [...]
     &lt;span class=&quot;n&quot;&gt;f_mxnet&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;to_mxnet_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;f_mxnet&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt [...]
+    &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;z&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;h2 id=&quot;under-the-hood-of-the-pytorch-example&quot;&gt;Under the hood of the PyTorch Example&lt;/h2&gt;
-&lt;p&gt;As TVM provides &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L455&quot;&gt;functions&lt;/a&gt; to convert dlpack tensors to tvm &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;NDArray&lt;/code&gt;s and
+&lt;p&gt;As TVM provides &lt;a href=&quot;https://github.com/dmlc/tvm/blob/master/include/tvm/runtime/c_runtime_api.h#L455&quot;&gt;functions&lt;/a&gt; to convert dlpack tensors to tvm &lt;code class=&quot;highlighter-rouge&quot;&gt;NDArray&lt;/code&gt;s and
 vice-versa, so all that is needed is some syntactic sugar by wrapping functions.
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;convert_func&lt;/code&gt; is a generic converter for frameworks using tensors with dlpack
+&lt;code class=&quot;highlighter-rouge&quot;&gt;convert_func&lt;/code&gt; is a generic converter for frameworks using tensors with dlpack
 support, and can be used to implement convenient converters, such as
-&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;to_pytorch_func&lt;/code&gt;.&lt;/p&gt;
+&lt;code class=&quot;highlighter-rouge&quot;&gt;to_pytorch_func&lt;/code&gt;.&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_type&lt;/span&gt;&lt;span class=&quot;p&quot;&g [...]
     &lt;span class=&quot;k&quot;&gt;assert&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;callable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
     &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;_wrapper&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
-        &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ndarray&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_dlpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;to_dlpack_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
+        &lt;span class=&quot;n&quot;&gt;args&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;tuple&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ndarray&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_dlpack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;to_dlpack_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt [...]
             &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_type&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;arg&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;f [...]
         &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;args&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
@@ -1938,7 +2461,7 @@ support, and can be used to implement convenient converters, such as
 &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;to_pytorch_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
     &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch&lt;/span&gt;
     &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;torch.utils.dlpack&lt;/span&gt;
-    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&l [...]
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;convert_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm_func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&l [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 </content>
  </entry>
@@ -2121,7 +2644,7 @@ One paricular challenge we observed, is that batch matmul is a major performance
 
 &lt;p&gt;Batch matmul computation can be described more concretely as follows:&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;void BatchedGemm(input A, input B, output C, M, N, K, batch_dimension) {
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;void BatchedGemm(input A, input B, output C, M, N, K, batch_dimension) {
   for (int i = 0; i &amp;lt; batch_dimension; ++i)  {
     DoGemm(A[i],B[i],C[i],M,K,N)
   }
@@ -2190,7 +2713,7 @@ One paricular challenge we observed, is that batch matmul is a major performance
 
 &lt;p&gt;In TVM, a general batch matmul computation can be declared as:&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
 A = tvm.placeholder((batch, M, K), name='A')
 B = tvm.placeholder((batch, K, N), name='B')
 k = tvm.reduce_axis((0, K), 'k')
@@ -2205,7 +2728,7 @@ C = tvm.compute((batch, M, N),
 
 &lt;h3 id=&quot;tuning-parameters-of-blockthread-numbers&quot;&gt;Tuning parameters of block/thread numbers&lt;/h3&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;  # thread indices
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;  # thread indices
   block_y = tvm.thread_axis(&quot;blockIdx.y&quot;)
   block_x = tvm.thread_axis(&quot;blockIdx.x&quot;)
   thread_y = tvm.thread_axis((0, num_thread_y), &quot;threadIdx.y&quot;)
@@ -2230,9 +2753,9 @@ C = tvm.compute((batch, M, N),
   s[C].bind(vty, thread_yz)
   s[C].bind(vtx, thread_xz)
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
-&lt;p&gt;We fuse the outer dimensions of the batch matmul, i.e. the BB and FF of the op’s dimension, normally known as “batch” dimension in batch matmul computation. Then we split the outer and the inner dimensions by a factor of (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;number_thread * vthread&lt;/code&gt;).&lt;/p&gt;
+&lt;p&gt;We fuse the outer dimensions of the batch matmul, i.e. the BB and FF of the op’s dimension, normally known as “batch” dimension in batch matmul computation. Then we split the outer and the inner dimensions by a factor of (&lt;code class=&quot;highlighter-rouge&quot;&gt;number_thread * vthread&lt;/code&gt;).&lt;/p&gt;
 
-&lt;p&gt;Strided pattern is not needed in batch matmul, thus the virtual thread number (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;vthread_y&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;vthread_x&lt;/code&gt;) are both set to 1.&lt;/p&gt;
+&lt;p&gt;Strided pattern is not needed in batch matmul, thus the virtual thread number (&lt;code class=&quot;highlighter-rouge&quot;&gt;vthread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;vthread_x&lt;/code&gt;) are both set to 1.&lt;/p&gt;
 
 &lt;h4 id=&quot;finding-the-best-combination-of-number_thread&quot;&gt;Finding the best combination of number_thread&lt;/h4&gt;
 
@@ -2281,7 +2804,7 @@ C = tvm.compute((batch, M, N),
   &lt;/tbody&gt;
 &lt;/table&gt;
 
-&lt;p&gt;As learned from &lt;a href=&quot;http://tvmlang.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html&quot;&gt;past experience&lt;/a&gt;, the method to find the best combination of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; is through brute-force search. After a brute-force search, the best  [...]
+&lt;p&gt;As learned from &lt;a href=&quot;http://tvmlang.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example.html&quot;&gt;past experience&lt;/a&gt;, the method to find the best combination of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; is through brute-force search. After a brute-force search, the best combination for current shape can be f [...]
 
 &lt;h2 id=&quot;fuse-batch-matmul-with-other-operations&quot;&gt;Fuse batch matmul with other operations&lt;/h2&gt;
 
@@ -2291,7 +2814,7 @@ C = tvm.compute((batch, M, N),
 
 &lt;p&gt;Batch matmul and broadcast add fusion computation can be declared as follows:&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
 A = tvm.placeholder((batch_size, features, M, K), name='A')
 # the shape of B is (N, K) other than (K, N) is because B is transposed is this fusion pattern
 B = tvm.placeholder((batch_size, features, N, K), name='B')
@@ -2306,7 +2829,7 @@ D = topi.broadcast_add(C, ENTER)
 
 &lt;p&gt;Batch matmul and transpose fusion computation can be declared as:&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;# computation representation
 A = tvm.placeholder((batch_size, features, M, K), name='A')
 B = tvm.placeholder((batch_size, features, K, N), name='B')
 k = tvm.reduce_axis((0, K), 'k')
@@ -2320,10 +2843,10 @@ C = tvm.compute(
 &lt;p&gt;The shape of [batch=64, heads=8, M=1, N=17, K=128] is chosen to elaborate the performance of the generated code. 17 is chosen as the sequence length since it is the average input length in our production scenarios.&lt;/p&gt;
 
 &lt;ul&gt;
-  &lt;li&gt;tf-r1.4 &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 513.9 us&lt;/li&gt;
-  &lt;li&gt;tf-r1.4 &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (separate): 541.9 us&lt;/li&gt;
-  &lt;li&gt;TVM &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 37.62 us&lt;/li&gt;
-  &lt;li&gt;TVM &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (fused): 38.39 us&lt;/li&gt;
+  &lt;li&gt;tf-r1.4 &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 513.9 us&lt;/li&gt;
+  &lt;li&gt;tf-r1.4 &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (separate): 541.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt;: 37.62 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;BatchMatmul&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;Transpose&lt;/code&gt; (fused): 38.39 us&lt;/li&gt;
 &lt;/ul&gt;
 
 &lt;p&gt;The kernel fusion optimization brings a further &lt;b&gt;&lt;em&gt;1.7X&lt;/em&gt;&lt;/b&gt; speed-up.&lt;/p&gt;
@@ -2538,7 +3061,7 @@ not require explicit vectorization. But also notice that the newer
 Mali Bitfrost GPUs are based on quad-style vectorization and does not
 require explicit vectorization.&lt;/li&gt;
   &lt;li&gt;All threads in Mali GPUs have individual program counters. It means
-the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;warp size&lt;/code&gt; is 1, so that branch divergence is not a major problem.&lt;/li&gt;
+the &lt;code class=&quot;highlighter-rouge&quot;&gt;warp size&lt;/code&gt; is 1, so that branch divergence is not a major problem.&lt;/li&gt;
 &lt;/ul&gt;
 
 &lt;h1 id=&quot;optimization--convolution-as-example&quot;&gt;Optimization : Convolution as Example&lt;/h1&gt;
@@ -2609,15 +3132,15 @@ tiling so that we can access the memory sequentially, which reduces
 cache miss rate.&lt;/p&gt;
 
 &lt;p&gt;We do tiling on the width dimension of the input image and CO dimension
-of the filter matrix.  This is described by &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;tvm.compute&lt;/code&gt;.&lt;/p&gt;
+of the filter matrix.  This is described by &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.compute&lt;/code&gt;.&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tiling factor
 &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;VW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# get input shape
-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;spa [...]
-&lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&qu [...]
+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&qu [...]
 &lt;span class=&quot;n&quot;&gt;TH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IH&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;H_PAD&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;TW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;IW&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;W_PAD&lt;/span&gt;
 
@@ -2635,33 +3158,33 @@ of the filter matrix.  This is described by &lt;code class=&quot;language-plaint
 &lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CO&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OW&lt;/span&gt;&lt;span class=&quo [...]
 
 &lt;span class=&quot;c1&quot;&gt;# define packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&l [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&l [...]
     &lt;span class=&quot;n&quot;&gt;data_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VH&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&q [...]
 
-&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt [...]
+&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kvshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt [...]
     &lt;span class=&quot;n&quot;&gt;kernel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&q [...]
 
 &lt;span class=&quot;c1&quot;&gt;# define convolution
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt [...]
-&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
-&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;CI&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt [...]
+&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;KW&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span cl [...]
 
-&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ovshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;sp [...]
-    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
-            &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
+&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ovshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;sp [...]
+    &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+            &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt [...]
             &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;sp [...]
 
 &lt;span class=&quot;c1&quot;&gt;# unpack to correct layout
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/ [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;oshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;n&lt;/ [...]
                      &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;//&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;VC&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;][&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt [...]
                      &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'output_unpack'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tag&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'direct_conv_output'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;We can inspect the defined IR by&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;spa [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;spa [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;I pick the convolution part here.&lt;/p&gt;
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;produce conv {
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;produce conv {
   for (co, 0, 64) {
     for (h, 0, 56) {
       for (w, 0, 14) {
@@ -2700,35 +3223,35 @@ our code can run on Mali GPU.&lt;/p&gt;
     &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot; tile and bind 3d &quot;&quot;&quot;&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;or&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;z_factor&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;x_factor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;x_factor&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;or&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_factor&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span clas [...]
-    &lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span clas [...]
-    &lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span clas [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class= [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class= [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class= [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class= [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class= [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class= [...]
+    &lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
 
 &lt;span class=&quot;c1&quot;&gt;# set tunable parameter
 &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule data packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
 
 &lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
 
 &lt;span class=&quot;c1&quot;&gt;# schedule conv
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
-&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
 
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
 
-&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
@@ -2762,41 +3285,41 @@ our code can run on Mali GPU.&lt;/p&gt;
 &lt;h3 id=&quot;kernel-2-unrolling&quot;&gt;Kernel 2: unrolling&lt;/h3&gt;
 &lt;p&gt;Loop unrolling can reduce the instructions for loop control, reduce
 branch penalties and hide latency in reading memory.
-In TVM, this can be done easily by calling &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;s.unroll(axis)&lt;/code&gt;&lt;/p&gt;
+In TVM, this can be done easily by calling &lt;code class=&quot;highlighter-rouge&quot;&gt;s.unroll(axis)&lt;/code&gt;&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# set tunable parameter
 &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule data packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
 
 &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
 
 &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule conv
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
-&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
 
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
 
 &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! ADD UNROLL HERE !!&quot;&quot;&quot;&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
-&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
@@ -2839,37 +3362,37 @@ In TVM, this can be done easily by calling &lt;code class=&quot;language-plainte
 &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule data packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cla [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
 
 &lt;span class=&quot;c1&quot;&gt;# unroll
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule kernel packing
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ci&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;s [...]
 
 &lt;span class=&quot;c1&quot;&gt;# unroll
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! VECTORIZE HERE !!&quot;&quot;&quot;&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kernel_vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
 &lt;span class=&quot;c1&quot;&gt;# schedule conv
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
-&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;w&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;kc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o [...]
 
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;h&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span cl [...]
 
 &lt;span class=&quot;c1&quot;&gt;# unroll
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;kw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unroll&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vw&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;span class=&quot;s&quot;&gt;&quot;&quot;&quot;!! VECTORIZE HERE !!&quot;&quot;&quot;&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;conv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vectorize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;vc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
-&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;_&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ow&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&q [...]
 &lt;span class=&quot;n&quot;&gt;tile_and_bind3d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;co&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oh&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;spa [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
@@ -2912,7 +3435,7 @@ In TVM, this can be done easily by calling &lt;code class=&quot;language-plainte
 
 &lt;h3 id=&quot;how-to-set-the-tunable-parameter&quot;&gt;How to set the tunable parameter&lt;/h3&gt;
 &lt;p&gt;As for the tunable parameters above, some can be calculated.
-For the vectorized dimension &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;VC&lt;/code&gt;, we should fill the 128-bit register,
+For the vectorized dimension &lt;code class=&quot;highlighter-rouge&quot;&gt;VC&lt;/code&gt;, we should fill the 128-bit register,
 so it can be set as 128/32=4 for float32 and 128/16=8 for float16.&lt;/p&gt;
 
 &lt;p&gt;But more often we cannot determine the optimal value, due to the
@@ -2922,7 +3445,7 @@ IR rather than direct OpenCL code.&lt;/p&gt;
 
 &lt;h3 id=&quot;the-generated-opencl-code&quot;&gt;The generated OpenCL code&lt;/h3&gt;
 &lt;p&gt;We can view the generated OpenCL code by&lt;/p&gt;
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/sp [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;func&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/sp [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 &lt;p&gt;The OpenCL code is too long to be pasted here, and it is hard to read due
 to heavy unrolling. If interested, you can view it
@@ -2933,7 +3456,7 @@ to heavy unrolling. If interested, you can view it
 different backends on some popular deep neural networks.
 Our test environment is&lt;/p&gt;
 
-&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Firefly-RK3399 4G
+&lt;div class=&quot;highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;Firefly-RK3399 4G
 CPU: dual-core Cortex-A72 + quad-core Cortex-A53
 GPU: Mali-T860MP4
 
@@ -3069,7 +3592,7 @@ In order to optimize a computation task, one has to edit the code on the develop
 
 &lt;h2 id=&quot;run-tvm-app-on-android-phone&quot;&gt;Run TVM APP on Android Phone&lt;/h2&gt;
 
-&lt;p&gt;You can find Android RPC APP in &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/android_rpc&quot;&gt;apps/android_rpc&lt;/a&gt;. Please follow the instruction to build for your Android device. Once the APK is built, sign it using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;apps/android_rpc/dev_tools&lt;/code&gt; and install it on the phone. The APP looks like:&lt;/p&gt;
+&lt;p&gt;You can find Android RPC APP in &lt;a href=&quot;https://github.com/dmlc/tvm/tree/master/apps/android_rpc&quot;&gt;apps/android_rpc&lt;/a&gt;. Please follow the instruction to build for your Android device. Once the APK is built, sign it using &lt;code class=&quot;highlighter-rouge&quot;&gt;apps/android_rpc/dev_tools&lt;/code&gt; and install it on the phone. The APP looks like:&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/android_rpc/app.png&quot; alt=&quot;image&quot; width=&quot;25%&quot; /&gt;
 &lt;img src=&quot;/images/android_rpc/app_error.png&quot; alt=&quot;image&quot; width=&quot;25%&quot; /&gt;&lt;/p&gt;
@@ -3084,17 +3607,17 @@ In order to optimize a computation task, one has to edit the code on the develop
 &lt;p&gt;Now we can connect to the proxy server from the laptop:&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;9090&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;9090&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/sp [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;This will give us a handler &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;remote&lt;/code&gt; which we can use to communicate with the mobile phone. For instance, the following lines create a 1024x1024 matrix on phone’s GPU:&lt;/p&gt;
+&lt;p&gt;This will give us a handler &lt;code class=&quot;highlighter-rouge&quot;&gt;remote&lt;/code&gt; which we can use to communicate with the mobile phone. For instance, the following lines create a 1024x1024 matrix on phone’s GPU:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span [...]
-	&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;&lt;span cla [...]
-	&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span [...]
+	&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;&lt;span cla [...]
+	&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;When &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;A.asnumpy()&lt;/code&gt; is called from the laptop, the matrix &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;A &lt;/code&gt;will be copied to phone’s RAM and then transfer to the laptop through the proxy server. The TVM RPC interface is transparent to users.&lt;/p&gt;
+&lt;p&gt;When &lt;code class=&quot;highlighter-rouge&quot;&gt;A.asnumpy()&lt;/code&gt; is called from the laptop, the matrix &lt;code class=&quot;highlighter-rouge&quot;&gt;A &lt;/code&gt;will be copied to phone’s RAM and then transfer to the laptop through the proxy server. The TVM RPC interface is transparent to users.&lt;/p&gt;
 
 &lt;h2 id=&quot;gemm-matrix-multiplication-on-the-phone&quot;&gt;GEMM (Matrix Multiplication) on the Phone&lt;/h2&gt;
 
@@ -3102,34 +3625,34 @@ In order to optimize a computation task, one has to edit the code on the develop
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;gemm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
-    &lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
-    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
+    &lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span c [...]
+    &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span  [...]
 
-    &lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
         &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
-        &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ii&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;jj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span  [...]
+        &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ii&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;jj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sum&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span  [...]
         &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'C'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
-    &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;threadIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;threadIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
-    &lt;span class=&quot;n&quot;&gt;bo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&qu [...]
-    &lt;span class=&quot;n&quot;&gt;to&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&qu [...]
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;bo&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;to&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quo [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=&q [...]
+    &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ti&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class=& [...]
 
-    &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&q [...]
+    &lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;lower&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&q [...]
 
-    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
     	&lt;span class=&quot;s&quot;&gt;&quot;opencl&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
     	&lt;span class=&quot;n&quot;&gt;target_host&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm -target=arm64-linux-android&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
     	&lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;There’s nothing special except the last line. Here we set the target to ‘opencl’ since this is the computation language which our Mali GPU supports. Note that we set &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;target_host&lt;/code&gt; to ‘&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;llvm -target=arm64-linux-android&lt;/code&gt;’, it depends on what architecture your Android Phone is. We tested on Samsung Galaxy S6 Edge, which has a Ma [...]
+&lt;p&gt;There’s nothing special except the last line. Here we set the target to ‘opencl’ since this is the computation language which our Mali GPU supports. Note that we set &lt;code class=&quot;highlighter-rouge&quot;&gt;target_host&lt;/code&gt; to ‘&lt;code class=&quot;highlighter-rouge&quot;&gt;llvm -target=arm64-linux-android&lt;/code&gt;’, it depends on what architecture your Android Phone is. We tested on Samsung Galaxy S6 Edge, which has a Mali-T760 GPU. Here is the CPU info for  [...]
 
 &lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nv&quot;&gt;$ &lt;/span&gt;adb shell
 shell@zenltechn:/ &lt;span class=&quot;nv&quot;&gt;$ &lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;cat&lt;/span&gt; /proc/cpuinfo
@@ -3154,17 +3677,17 @@ Hardware	: SAMSUNG Exynos7420
 
 &lt;p&gt;Please refer to &lt;a href=&quot;https://clang.llvm.org/docs/CrossCompilation.html#target-triple&quot;&gt;target triple&lt;/a&gt; to learn the compile options for LLVM.&lt;/p&gt;
 
-&lt;p&gt;We use &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;tvm.contrib.ndk&lt;/code&gt; to build the shared library for the Android system,&lt;/p&gt;
+&lt;p&gt;We use &lt;code class=&quot;highlighter-rouge&quot;&gt;tvm.contrib.ndk&lt;/code&gt; to build the shared library for the Android system,&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kn&quot;&gt;from&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm.contrib&lt;/span&gt; &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;, [...]
 &lt;span class=&quot;n&quot;&gt;N&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gemm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bn&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;256&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tempdir&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relpath&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;export_library&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ndk&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_shared&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;util&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tempdir&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;temp&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relpath&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;export_library&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ndk&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_shared&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;ndk.create_shared&lt;/code&gt; reads the environment variable &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;TVM_NDK_CC&lt;/code&gt; to find the compiler &amp;amp; linker for the Android device. We can easily use NDK to generate standalone toolchain for our device. For example, the following commands generate standalone compilers and linkers for ARM64 Android devices.&lt;/p&gt;
+&lt;p&gt;&lt;code class=&quot;highlighter-rouge&quot;&gt;ndk.create_shared&lt;/code&gt; reads the environment variable &lt;code class=&quot;highlighter-rouge&quot;&gt;TVM_NDK_CC&lt;/code&gt; to find the compiler &amp;amp; linker for the Android device. We can easily use NDK to generate standalone toolchain for our device. For example, the following commands generate standalone compilers and linkers for ARM64 Android devices.&lt;/p&gt;
 
 &lt;div class=&quot;language-bash highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;nb&quot;&gt;cd&lt;/span&gt; /opt/android-ndk/build/tools/
 ./make-standalone-toolchain.sh &lt;span class=&quot;nt&quot;&gt;--platform&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;android-24 &lt;span class=&quot;nt&quot;&gt;--use-llvm&lt;/span&gt; &lt;span class=&quot;nt&quot;&gt;--arch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;arm64 &lt;span class=&quot;nt&quot;&gt;--install-dir&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;/opt/android-toolchain-arm64
@@ -3172,34 +3695,34 @@ Hardware	: SAMSUNG Exynos7420
 
 &lt;p&gt;If everything goes right, we’ve got a shared library ‘gemm_gpu.so’. Now let’s upload it to the mobile phone, make the phone load the module and get a remote handler,&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&qu [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rpc&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;connect&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;0.0.0.0&qu [...]
 
-&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;upload&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;upload&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;path_dso&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;gemm_gpu.so&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;Create the remote arrays and print the running time,&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;sp [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;remote&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cl&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;sp [...]
 
 &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
-&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
+&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uniform&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;size&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cla [...]
 
-&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
-&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
-&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
 
-&lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;time_evaluator&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;entry_name&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &l [...]
-&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quo [...]
-&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'%g secs/op, %g GFLOPS'&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;%&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ngflops&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;N&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;time_evaluator&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;entry_name&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &l [...]
+&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;span class=&quot [...]
+&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;g secs/op, &lt;/span&gt;&lt;span class=&quot;si&quot;&gt;%&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;g GFLOPS'&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;%&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cost&lt;/span&gt;&lt [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;Now we can verify the results on PC,&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_almost_equal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
-	&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
-	&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_almost_equal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
+	&lt;span class=&quot;n&quot;&gt;a_np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b_np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
 	&lt;span class=&quot;n&quot;&gt;decimal&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;3&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
@@ -3303,7 +3826,7 @@ make jvminstall
 
 &lt;ul&gt;
   &lt;li&gt;Loads Resnet 50 model from &lt;a href=&quot;https://mxnet.incubator.apache.org/versions/master/api/python/gluon/model_zoo.html&quot;&gt;the Gluon model zoo&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;Converts Gluon Resnet 50 model to NNVM graph format, using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;nnvm.frontend.from_mxnet (...)&lt;/code&gt;&lt;/li&gt;
+  &lt;li&gt;Converts Gluon Resnet 50 model to NNVM graph format, using &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_mxnet (...)&lt;/code&gt;&lt;/li&gt;
   &lt;li&gt;Compiles and executes the graph with ROCm backend&lt;/li&gt;
 &lt;/ul&gt;
 
@@ -3320,7 +3843,7 @@ TVM prediction top-1: 282 tiger cat&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
 
 &lt;p&gt;The script &lt;a href=&quot;https://github.com/ROCmSoftwarePlatform/nnvm-rocm/blob/master/advanced_superres_onnx.py&quot;&gt;advanced_superres_onnx.py&lt;/a&gt; gives an example of loading a model trained with PyTorch. The model is stored in the &lt;a href=&quot;https://onnx.ai/&quot;&gt;ONNX&lt;/a&gt; format. In this example, our network takes an low resolution image as input, and outputs a 4x high resolution image. We refer the details of a problem setup and the network archit [...]
 
-&lt;p&gt;In order to use models in the ONNX format with NNVM, we first use &lt;a href=&quot;https://github.com/onnx/onnx&quot;&gt;the ONNX library&lt;/a&gt; to load the ONNX model into the Protocol buffer object. We can then use &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;nnvm.frontend.from_onnx(...)&lt;/code&gt; to obtain an equivalent NNVM graph. With a NNVM graph in hand, we can follow the generic workflow of compilation and graph execution outlined above.&lt;/p&gt;
+&lt;p&gt;In order to use models in the ONNX format with NNVM, we first use &lt;a href=&quot;https://github.com/onnx/onnx&quot;&gt;the ONNX library&lt;/a&gt; to load the ONNX model into the Protocol buffer object. We can then use &lt;code class=&quot;highlighter-rouge&quot;&gt;nnvm.frontend.from_onnx(...)&lt;/code&gt; to obtain an equivalent NNVM graph. With a NNVM graph in hand, we can follow the generic workflow of compilation and graph execution outlined above.&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/rocm/butterfly.png&quot; alt=&quot;image&quot; /&gt;&lt;/p&gt;
 
@@ -3345,46 +3868,46 @@ We are starting to look at performance optimization and we expect more improveme
 &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;tvm&lt;/span&gt;
 &lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;
 
-&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;n&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
-&lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
-&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&q [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&q [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&q [...]
+&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;var&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;n&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;B&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;placeholder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span cl [...]
+&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&q [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p& [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;C&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&qu [...]
 
 &lt;p&gt;Next, to use ROCm backend we build our kernel under “rocm” target. This will cause TVM to use our new code generator. We also need a runtime context for ROCm backend.&lt;/p&gt;
 
 &lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;rocm&quot;&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class= [...]
-&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+&lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;build&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;A&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;ctx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
 
 &lt;p&gt;After building the kernel and setting up a runtime context, we can launch our vector add kernel.&lt;/p&gt;
 
 &lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1024&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
-&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
-&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
+&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nd&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n [...]
 
 &lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt; &l [...]
+&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;testing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;assert_allclose&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;asnumpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt; &l [...]
 
 &lt;p&gt;We can view LLVM IR that TVM generates in the following way:&lt;/p&gt;
 
-&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;spa [...]
-&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;fadd_rocm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imported_modules&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;spa [...]
+&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;llvm&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/figure&gt;
 
 &lt;p&gt;You should see something like this:&lt;/p&gt;
 
 &lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-llvm&quot; data-lang=&quot;llvm&quot;&gt;&lt;span class=&quot;c1&quot;&gt;; ModuleID = 'myadd__kernel0'&lt;/span&gt;
-&lt;span class=&quot;k&quot;&gt;source_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
+&lt;span class=&quot;err&quot;&gt;source_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;datalayout&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64&quot;&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;triple&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;amdgcn-amd-amdhsa-hcc&quot;&lt;/span&gt;
 
 
 &lt;span class=&quot;c1&quot;&gt;; Function Attrs: nounwind&lt;/span&gt;
-&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;amdgpu_kernel&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=& [...]
+&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_kernel&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class [...]
 &lt;span class=&quot;nl&quot;&gt;entry:&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workgroup.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workitem.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
@@ -3428,7 +3951,7 @@ We are starting to look at performance optimization and we expect more improveme
 
 &lt;p&gt;We can also view GPU assembly that ROCm backend generates. This is the real code that runs on your GPU.&lt;/p&gt;
 
-&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;asm&quot;&lt;/span&gt;&lt;sp [...]
+&lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-python&quot; data-lang=&quot;python&quot;&gt;&lt;span class=&quot;k&quot;&gt;print&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dev_module&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get_source&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;asm&quot;&lt;/span&gt;&lt;sp [...]
 
 &lt;p&gt;The assembly should look something like this, omitting unnecessary details:&lt;/p&gt;
 
@@ -3607,18 +4130,18 @@ It’s an effective method to reduce the computation complexity of deep neural n
 &lt;p&gt;In TVM, depthwise convolution can be declared as:&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# padding stage
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
     &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;in_channel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;height_after_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;width_after_pad&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
-    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
-        &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;all&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt;  [...]
-        &lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span  [...]
+    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+        &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;all&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt;  [...]
+        &lt;span class=&quot;n&quot;&gt;Input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pad_top&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span  [...]
     &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;PaddedInput&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;span class=&quot;c1&quot;&gt;# depthconv stage
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),& [...]
-&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; & [...]
-&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),& [...]
+&lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reduce_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;filter_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; & [...]
+&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
     &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;batch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_channel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;out_width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
-    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;k&quot;&gt;lambda&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;j&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=& [...]
         &lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;b&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;channel_multiplier&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/spa [...]
         &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;di&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dj&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'DepthwiseConv2d'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
@@ -3671,21 +4194,21 @@ To avoid bank conflicts, it’s better that successive threads access successive
 &lt;h3 id=&quot;compute-paddedinput-inline-to-save-memory-allocation&quot;&gt;Compute PaddedInput Inline to Save Memory Allocation&lt;/h3&gt;
 &lt;p&gt;As we see from part 1, padding is declared explicitly as a separate stage. We compute it inline to avoid redundant memory allocation:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/sp [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;h3 id=&quot;divide-one-large-channel-into-smaller-blocks&quot;&gt;Divide One Large Channel into Smaller Blocks&lt;/h3&gt;
 &lt;p&gt;One straightforward schedule for depthwise convolution is that one cuda block takes care of one input channel and corresponding filters, loading them into shared memory and then computing:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;IS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/spa [...]
-&lt;span class=&quot;n&quot;&gt;FS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;shared&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span& [...]
-&lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.y&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;IS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PaddedInput&lt;/spa [...]
+&lt;span class=&quot;n&quot;&gt;FS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;cache_read&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Filter&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;shared&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span& [...]
+&lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.y&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;blockIdx.x&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;span class=&quot;c1&quot;&gt;# bind the dimension of batch (N in NCHW) with block_y
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;s [...]
 &lt;span class=&quot;c1&quot;&gt;# bind the dimension of channel (C in NCHW) with block_x
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;op&lt;/span&gt;&lt;s [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;We test the average time cost of 1000 runs on GTX 1080, and compare with &lt;a href=&quot;https://www.tensorflow.org/versions/r0.12/api_docs/python/nn/convolution#depthwise_conv2d&quot;&gt;depthwise_conv2d in tensorflow&lt;/a&gt;.
@@ -3742,14 +4265,14 @@ and one cuda block takes care of one 32 x 32 block:&lt;/p&gt;
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;blocking_h&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;blocking_w&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;
 &lt;span class=&quot;c1&quot;&gt;# split the dimension of height (H in NCHW)
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;s [...]
 &lt;span class=&quot;c1&quot;&gt;# split the dimension of width (W in NCHW)
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt; [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;s [...]
 &lt;span class=&quot;c1&quot;&gt;# assign one 32 x 32 block to one cuda block
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&l [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&q [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;by&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_y&lt;/span&gt;&lt;span class=& [...]
+&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fuse&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx1&lt;/span&gt;&lt;span class=&quo [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;block_x&lt;/span&gt;&lt;span class=& [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;Here is the new result:&lt;/p&gt;
@@ -3787,16 +4310,16 @@ and one cuda block takes care of one 32 x 32 block:&lt;/p&gt;
 
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
-&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
-&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&q [...]
-&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&q [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=& [...]
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&qu [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class= [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;There are two parameters in the schedule: &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt;. How to determine the optimal combination of them? 
+&lt;p&gt;There are two parameters in the schedule: &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt;. How to determine the optimal combination of them? 
 Well, let’s first do some experiments. Below is the result with Filter = [256, 1, 3, 3] and stride = [1, 1]:&lt;/p&gt;
 
 &lt;table&gt;
@@ -3852,7 +4375,7 @@ It has better data reuse than case 1’s 4x1 tile.&lt;/p&gt;
     &lt;p&gt;Case 3 is slower than case 2. It’s because in case 3, the workload per thread is too large and leads to much cost of local memory read.&lt;/p&gt;
   &lt;/li&gt;
   &lt;li&gt;
-    &lt;p&gt;Case 4 is slower than case 3. It’s because &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x = 32&lt;/code&gt; ensures no bank conflicts, while &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_y = 32&lt;/code&gt; doesn’t.&lt;/p&gt;
+    &lt;p&gt;Case 4 is slower than case 3. It’s because &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x = 32&lt;/code&gt; ensures no bank conflicts, while &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y = 32&lt;/code&gt; doesn’t.&lt;/p&gt;
   &lt;/li&gt;
 &lt;/ul&gt;
 
@@ -3860,14 +4383,14 @@ It has better data reuse than case 1’s 4x1 tile.&lt;/p&gt;
 
 &lt;ul&gt;
   &lt;li&gt;Large tile is good for data reuse, but not good for local memory read.&lt;/li&gt;
-  &lt;li&gt;The influence of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; on bank conflicts is asymmetric.&lt;/li&gt;
-  &lt;li&gt;To find the optimal combination of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; is to achieve a balance of efficient shared memory access (avoid bank conflicts), data reuse, and local memory read.&lt;/li&gt;
+  &lt;li&gt;The influence of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; on bank conflicts is asymmetric.&lt;/li&gt;
+  &lt;li&gt;To find the optimal combination of &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; is to achieve a balance of efficient shared memory access (avoid bank conflicts), data reuse, and local memory read.&lt;/li&gt;
 &lt;/ul&gt;
 
 &lt;p&gt;Pretty tricky. So, what exactly should we do to find the optimal combination? The answer is brute force search. 
-We can pass &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; as arguments to the schedule function, and try all possible combinations to find the optimal one. This can be done easily in TVM:&lt;/p&gt;
+We can pass &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_y&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x&lt;/code&gt; as arguments to the schedule function, and try all possible combinations to find the optimal one. This can be done easily in TVM:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(...,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;&lt;span class=&qu [...]
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot; [...]
     &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;
     &lt;span class=&quot;n&quot;&gt;do_schedule_as_usual&lt;/span&gt;
@@ -3875,8 +4398,8 @@ We can pass &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;n
 
 &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inf&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;all_possible_combinations&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(...,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&l [...]
-    &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;test_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(...,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+    &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=& [...]
+    &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;test_depthwise_conv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;...&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
     &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
         &lt;span class=&quot;n&quot;&gt;min_time_cost&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;time_cost&lt;/span&gt;
         &lt;span class=&quot;n&quot;&gt;optimal_combination&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
@@ -3891,22 +4414,22 @@ We can pass &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;n
 &lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
 &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
-&lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
-&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
-&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_vthread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/sp [...]
+&lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
+&lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;thread_axis&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;num_thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span [...]
 &lt;span class=&quot;c1&quot;&gt;# split the dimension of height (H in NCHW) twice
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vyi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&l [...]
-&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&q [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vyi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;yi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
 &lt;span class=&quot;c1&quot;&gt;# split the dimension of width (W in NCHW) twice
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vxi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&l [...]
-&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&q [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vxi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt [...]
+&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;xi&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;split&lt;/span&gt;&lt;span class=&qu [...]
 &lt;span class=&quot;c1&quot;&gt;# bind thread and vthread respectively
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
-&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vy&lt;/span&gt; [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_vx&lt;/span&gt;&lt;span clas [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_y&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bind&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;thread_x&lt;/span&gt;&lt;span class= [...]
+&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reorder&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tvy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvx&lt;/span&gt;&lt;span class=& [...]
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;Let’s print the IR to see what vthread does:&lt;/p&gt;
@@ -3957,7 +4480,7 @@ We can pass &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;n
 &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;As we can see, when &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_vthread_y = 2&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_vthread_x = 2&lt;/code&gt;, the 32 x 32 channel is divided into four sub-channels of 16 x 16.
+&lt;p&gt;As we can see, when &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_y = 2&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_x = 2&lt;/code&gt;, the 32 x 32 channel is divided into four sub-channels of 16 x 16.
 Each thread computes four output elements at a time, one element in one sub-channel.&lt;/p&gt;
 
 &lt;p&gt;Below is the result with Filter = [256, 1, 3, 3], stride = [1, 1], blocking_h = 32, blocking_w = 32:&lt;/p&gt;
@@ -4013,7 +4536,7 @@ table th:nth-of-type(2) {
   &lt;/tbody&gt;
 &lt;/table&gt;
 
-&lt;p&gt;Case 2 is faster than case 1. It’s because in case 2 &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_thread_x=8&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;num_vthread_x=4&lt;/code&gt; together ensures that consecutive threads access consecutive memory addresses,
+&lt;p&gt;Case 2 is faster than case 1. It’s because in case 2 &lt;code class=&quot;highlighter-rouge&quot;&gt;num_thread_x=8&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;num_vthread_x=4&lt;/code&gt; together ensures that consecutive threads access consecutive memory addresses,
 thus avoiding bank conflicts, as illustrated below (each color represents one thread’s workload):&lt;/p&gt;
 
 &lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/depthconv_tutorial/vthread_and_strided_pattern.png&quot; alt=&quot;image&quot; width=&quot;90%&quot; /&gt;&lt;/p&gt;
@@ -4077,17 +4600,17 @@ vthread saves additional 5us.&lt;/p&gt;
 &lt;p&gt;One typical optimization we can do in deep learning is operator fusion, that computes multiple operators together in a single kernel without saving intermediate results back to global memory.
 TVM supports that out of the box.&lt;/p&gt;
 
-&lt;p&gt;Consider a common pattern in neural networks: &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;relu&lt;/code&gt;. We can fuse the three operators into one, by slightly modifying the original schedule:&lt;/p&gt;
+&lt;p&gt;Consider a common pattern in neural networks: &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt;. We can fuse the three operators into one, by slightly modifying the original schedule:&lt;/p&gt;
 
-&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;depthwise_c [...]
-&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;scale_shift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/s [...]
-&lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
+&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;depthwise_c [...]
+&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;scale_shift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/s [...]
+&lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;topi&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relu&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 
 &lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Relu&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# is no longer DepthwiseConv2d
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# this line fuses ScaleShift, explicitly
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_scope&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;local&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# this line fuses DepthwiseConv2d, implicitly
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ScaleShift&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_inline&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# this line fuses ScaleShift, explicitly
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set_scope&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;local&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&qu [...]
 &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;schedule&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# schedule for Output the same way we schedule for DepthwiseConv2d as discussed above
-&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;].&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_at&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&l [...]
+&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;DepthwiseConv2d&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;compute_at&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;s&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Output&lt; [...]
 &lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
 &lt;p&gt;It generates IR like this:&lt;/p&gt;
@@ -4120,14 +4643,14 @@ TVM supports that out of the box.&lt;/p&gt;
 &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;As we can see, each thread computes &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;relu&lt;/code&gt; before writing the result of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; to global memory. The fused operator is as fast as single &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2 [...]
+&lt;p&gt;As we can see, each thread computes &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; and &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt; before writing the result of &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; to global memory. The fused operator is as fast as single &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;.
 Below is the result with Input = [1, 256, 96, 96], Filter = [256, 1, 3, 3], stride = [1, 1], padding = ‘SAME’:&lt;/p&gt;
 
 &lt;ul&gt;
-  &lt;li&gt;tf-1.2 &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;: 251.6 us&lt;/li&gt;
-  &lt;li&gt;tf-1.2 &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;relu&lt;/code&gt; (separate): 419.9 us&lt;/li&gt;
-  &lt;li&gt;TVM &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;: 90.9 us&lt;/li&gt;
-  &lt;li&gt;TVM &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;depthwise_conv2d + scale_shift + relu&lt;/code&gt; (fused): 91.5 us&lt;/li&gt;
+  &lt;li&gt;tf-1.2 &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;: 251.6 us&lt;/li&gt;
+  &lt;li&gt;tf-1.2 &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;scale_shift&lt;/code&gt; + &lt;code class=&quot;highlighter-rouge&quot;&gt;relu&lt;/code&gt; (separate): 419.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d&lt;/code&gt;: 90.9 us&lt;/li&gt;
+  &lt;li&gt;TVM &lt;code class=&quot;highlighter-rouge&quot;&gt;depthwise_conv2d + scale_shift + relu&lt;/code&gt; (fused): 91.5 us&lt;/li&gt;
 &lt;/ul&gt;
 
 &lt;p&gt;The advantage of operator fusion is obvious.&lt;/p&gt;
@@ -4157,133 +4680,5 @@ He is experiencing a gap year after obtaining a bachelor’s degree in electrica
 </content>
  </entry>
  
- <entry>
-   <title>TVM: An End to End IR Stack for Deploying Deep Learning Workloads on Hardware Platforms</title>
-   <link href="https://tvm.apache.org/2017/08/17/tvm-release-announcement"/>
-   <updated>2017-08-17T12:00:00-07:00</updated>
-   <id>https://tvm.apache.org/2017/08/17/tvm-release-announcement</id>
-   <content type="html">&lt;p style=&quot;text-align: center&quot;&gt;Tianqi Chen(project lead), Thierry Moreau(hardware stack), Ziheng Jiang†(graph compilation), Haichen Shen(gpu optimization)&lt;/p&gt;
-&lt;p style=&quot;text-align: center&quot;&gt;Advisors: Luis Ceze, Carlos Guestrin, Arvind Krishnamurthy&lt;/p&gt;
-&lt;p style=&quot;text-align: center&quot;&gt;Paul G. Allen School of Computer Science &amp;amp; Engineering, University of Washington&lt;/p&gt;
-&lt;p style=&quot;text-align: center&quot;&gt;DMLC open-source community&lt;/p&gt;
-&lt;p style=&quot;text-align: center&quot;&gt;†Amazon Web Service&lt;/p&gt;
-
-&lt;p&gt;Deep learning has become ubiquitous and indispensable.  Part of this revolution has been fueled by scalable deep learning systems, such as TensorFlow, MXNet, Caffe and PyTorch. Most existing systems are optimized for a narrow range of server-class GPUs, and require significant effort be deployed on other platforms such as mobile phones, IoT devices and specialized accelerators (FPGAs, ASICs). As the number of deep learning frameworks and hardware backends increase, we propose a  [...]
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/gap.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
-&lt;p&gt;We are excited to announce the launch of TVM as solution to this problem. TVM is  a novel framework that can:&lt;/p&gt;
-
-&lt;ul&gt;
-  &lt;li&gt;Represent and optimize the common deep learning computation workloads for CPUs, GPUs and other specialized hardware&lt;/li&gt;
-  &lt;li&gt;Automatically transform the computation graph to minimize memory utilization, optimize data layout and fuse computation patterns&lt;/li&gt;
-  &lt;li&gt;Provide an end-to-end compilation from existing front-end frameworks down to bare-metal hardware, all the way up to browser executable javascripts.&lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;p&gt;With the help of TVM, we can easily run deep learning workloads on mobile phones, embedded devices and even the browser with little additional effort. TVM also provides a unified optimization framework for deep learning workloads on a multitude of hardware platforms, including specialized accelerators that rely on novel computational primitives.&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/end_to_end_stack.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;We adopt a common philosophy from the compiler community and provide two intermediate representation layers to efficiently lower high-level deep learning algorithms down to a multitude of hardware back-ends.&lt;/p&gt;
-
-&lt;p&gt;In today’s release, we open-source TVM package that contains optimization primitives for x86, ARM, OpenCL, Metal, CUDA and Javascript. We are actively working on adding support for specialized hardware acceleration and Nvidia’s GEMM-optimized Volta architecture.&lt;/p&gt;
-
-&lt;h2 id=&quot;technical-details&quot;&gt;Technical Details&lt;/h2&gt;
-
-&lt;p&gt;The goal of TVM stack is to provide a reusable toolchain to compile high-level neural network descriptions from deep learning framework frontends down to low-level machine code for multiple hardware backends. Take Apache MXNet as a front-end example, the following code snippet demonstrates how can TVM be used to compile a high-level description of a deep learning model to an optimized executable module tailored to the target hardware.&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/code_highlevel.png&quot; alt=&quot;image&quot; width=&quot;800px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;The challenge lies in enabling support for multiple hardware back-ends while keeping compute, memory and energy footprints at their lowest. We borrow wisdom from the compiler community in order to bridge the gap between the multitude of deep learning frameworks and hardware back-ends: we build a two-level intermediate layer composed of NNVM, a high-level intermediate representation (IR) for task scheduling and memory management, and TVM, an expressive low-level IR for optimizing [...]
-
-&lt;p&gt;The first level of the stack is a computational graph based representation. A computation graph is a directed acyclic graph that represent computation as nodes and dataflow dependency as edges. This representation is very powerful: it allows us to bake operation attributes into the computation graph and specify transformation rules to iteratively optimize a computation graph. This is a common approach taken by most of the existing deep learning frameworks, including the NNVM gra [...]
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/computational_graph.png&quot; alt=&quot;image&quot; width=&quot;300px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;A lot of powerful optimizations can be supported by the graph optimization framework. For example, we provided a sublinear memory optimization functionality that allows user to train 1000 layers of ImageNet ResNet on a single GPU.&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/nnvm_gap.png&quot; alt=&quot;image&quot; width=&quot;512px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;However, we find that the computational graph based IR alone is not enough to solve the challenge of supporting different hardware backends. The reason being that a single graph operator like convolution or matrix multiplication may be mapped and optimized in very different ways for each hardware back-end. These hardware-specific optimizations can vary drastically in terms of memory layout, parallelization threading patterns, caching access patterns and choice of hardware primit [...]
-
-&lt;p&gt;We build a low level representation to solve this problem. This representation is based on index formula, with additional support for recurrence computation.&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_dsl.png&quot; alt=&quot;image&quot; width=&quot;700px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;The low level IR adopt principles from existing image processing languages like Halide or darkroom to formulate an expressive deep learning DSL. TVM builds low level optimizations inspired by loop transformation tools like loopy and polyhedra-based analysis. We also draw inspiration from the dataflow description languages used in deep learning frameworks like MXNet, TensorFlow, Theano. The algorithms described in TVM are then processed in a scheduling phase to apply transformati [...]
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_backends.png&quot; alt=&quot;image&quot; width=&quot;600px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;TVM includes standard transformation primitives commonly found in CPU optimization frameworks. More importantly, TVM incorporates novel optimization primitives targeted at GPUs, by exploiting thread cooperation patterns, data layout transformations and powerful new compute primitives. Using TVM in combination with NNVM provides an rich opportunity to optimize deep learning workloads across the software stack, enabling joint compute graph-level and operator-level optimizations.&l [...]
-
-&lt;h3 id=&quot;multi-language-and-platform-support&quot;&gt;Multi-language and Platform Support&lt;/h3&gt;
-
-&lt;p&gt;One of the many strength of TVM lies in its rich support for multiple platforms and languages. We present two components of the framework: the compiler stack which contains complete optimization libraries to produce optimized machine code, and the runtime which is lightweight and offers the portability required to deploy the compiled modules on different platforms.&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_flexible.png&quot; alt=&quot;image&quot; width=&quot;600px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;TVM currently support a python and C++ interface to the embedded compiler stack. We design the framework with maximum re-use in mind, so that the compiler stack improvements can be applied interchangeably between the Python and C++ components.&lt;/p&gt;
-
-&lt;p&gt;We also provide a lightweight runtime that can directly run TVM compiled code in languages such as javascript, java, python, and c++ on platforms including android, iOS, raspberry pi and web browsers.&lt;/p&gt;
-
-&lt;h3 id=&quot;remote-deployment-and-execution&quot;&gt;Remote Deployment and Execution&lt;/h3&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/tvm_rpc.png&quot; alt=&quot;image&quot; width=&quot;500px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;TVM supports cross-compilation for and testing embedded devices with TVM RPC, a lightweight interface to deploy and execute TVM cross-compiled modules on a remote embedded device. This provides a familiar high-level Python interface to the TVM user to compile, optimize and test deep learning algorithms remotely on various low-level embedded devices.&lt;/p&gt;
-
-&lt;h2 id=&quot;performance&quot;&gt;Performance&lt;/h2&gt;
-
-&lt;p&gt;TVM is still in an early stage of development and we can expect more improvements to come, but we have started to see very promising results, which are discussed in this section.&lt;/p&gt;
-
-&lt;p&gt;TVM gives us the flexibility to explore the rich optimization space of various deep learning kernels, for multiple hardware platforms. For instance, TVM allows us to tailor data layout and fused pattern requirements for the kernels and platforms that we most care about.  Please note that the baseline libraries are created for more general purpose problems, while TVM’s optimized kernels are heavily tuned for the workloads we evaluated via an auto-tuning process. TVM serves as a b [...]
-
-&lt;p&gt;The results listed in this section are still work in progress, and there is room for improvement.&lt;/p&gt;
-
-&lt;h3 id=&quot;raspberry-pi&quot;&gt;Raspberry Pi&lt;/h3&gt;
-
-&lt;p&gt;In the first part of result we compared the TVM CPU schedule to nnpack on a raspberry Pi 3B executing a resnet workload. Due to limited time, we utilized TVM to implemented the direct convolution while nnpack was used to perform winograd conv for 3x3 kernels.&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/resnet_rasp.png&quot; alt=&quot;image&quot; width=&quot;500px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;We can find that with TVM’s autotuned kernels, we can obtain performance similar to the hand-optimized kernels in nnpack for the raspberry pi experiments.&lt;/p&gt;
-
-&lt;h3 id=&quot;gpu-results&quot;&gt;GPU Results&lt;/h3&gt;
-&lt;p&gt;&lt;strong&gt;Author Credit&lt;/strong&gt; These benchmarks and corresponding schedule optimizations are created by our contributors:  &lt;a href=&quot;http://www.ece.ucdavis.edu/~laurawly/&quot;&gt;Leyuan Wang&lt;/a&gt; (AWS / UCDavis), &lt;a href=&quot;http://huyuwei.github.io&quot;&gt;Yuwei Hu&lt;/a&gt;(TuSimple) and Weitang Liu (AWS/ UCDavis). They deserve all the credits.&lt;/p&gt;
-
-&lt;p&gt;As a proof of concept, we created an end to end compilation pipeline that can compile MxNet models down to TVM execution graphs. We apply optimization within and between graph nodes by automatically fusing operators together and letting TVM generate the fused kernels.
-We benchmarked the mobilenet ImageNet workload, and discuss the results below:&lt;/p&gt;
-
-&lt;p style=&quot;text-align: center&quot;&gt;&lt;img src=&quot;/images/release/gpu_mobilenet.png&quot; alt=&quot;image&quot; width=&quot;600px&quot; /&gt;&lt;/p&gt;
-
-&lt;p&gt;We can find that TVM can outperform our baseline method in terms of speed. More interestingly, the kernel fusion brings additional speedup. It is worth mentioning that TVM generates all the optimized GPU kernels on its own without relying on external libraries like CuDNN.&lt;/p&gt;
-
-&lt;p&gt;We are working on more experiments and will release new results as they are obtained.&lt;/p&gt;
-
-&lt;h2 id=&quot;open-source-effort&quot;&gt;Open Source Effort&lt;/h2&gt;
-&lt;p&gt;TVM started as a research project of Paul G. Allen School Computer Science and Engineering at University of Washington. The TVM stack is designed to support &lt;a href=&quot;https://github.com/dmlc/dlpack&quot;&gt;DLPack&lt;/a&gt;, a consensus on tensor data structure by multiple major deep learning frameworks. We have received early contributions from from UW, AWS, Qiho 360, Facebook, HKUST, TuSimple, UCDavis, SJTU as well members of DMLC open-source community and DLPack initia [...]
-
-&lt;h2 id=&quot;acknowledgement&quot;&gt;Acknowledgement&lt;/h2&gt;
-&lt;p&gt;This project wouldn’t become possible without our early contributors. We would like to thank Yizhi Liu(Qihoo 360), Yuwei Hu(TuSimple),
-Xingjian Shi(HKUST), Leyuan Wang(AWS/UCDavis), Nicolas Vasilache(Facebook), Jian Weng(UCLA), Weitang Liu(AWS/UCDavis), Edward Z. Yang(Facebook),
-Lianmin Zheng(SJTU), Qiao Zhang(UW), William Moses(Facebook/MIT) and Hu Shiwen. The author would also like to thank Xianyi Zhang(PerfXLab) for helpful discussions.&lt;/p&gt;
-
-&lt;p&gt;We also learnt a lot from the following projects when building TVM.&lt;/p&gt;
-&lt;ul&gt;
-  &lt;li&gt;&lt;a href=&quot;https://github.com/halide/Halide&quot;&gt;Halide&lt;/a&gt;: TVM uses &lt;a href=&quot;https://github.com/dmlc/HalideIR&quot;&gt;HalideIR&lt;/a&gt; as data structure for
-arithematic simplification and low level lowering. HalideIR is derived from Halide.
-We also learns from Halide when implementing the lowering pipeline in TVM.&lt;/li&gt;
-  &lt;li&gt;&lt;a href=&quot;https://github.com/inducer/loopy&quot;&gt;Loopy&lt;/a&gt;: use of integer set analysis and its loop transformation primitives.&lt;/li&gt;
-  &lt;li&gt;&lt;a href=&quot;https://github.com/Theano/Theano&quot;&gt;Theano&lt;/a&gt;: the design inspiration of symbolic scan operator for recurrence.&lt;/li&gt;
-&lt;/ul&gt;
-
-&lt;h2 id=&quot;source-code&quot;&gt;Source code&lt;/h2&gt;
-&lt;ul&gt;
-  &lt;li&gt;Github page can be found here: &lt;a href=&quot;https://github.com/dmlc/tvm&quot;&gt;https://github.com/dmlc/tvm&lt;/a&gt;&lt;/li&gt;
-  &lt;li&gt;TVM is &lt;a href=&quot;https://github.com/dmlc/dlpack&quot;&gt;DLPack&lt;/a&gt; compatible, which makes it easy to support frameworks
-that adopts the standard, such as MXNet, PyTorch, Caffe2 and tiny-dnn.&lt;/li&gt;
-&lt;/ul&gt;
-</content>
- </entry>
- 
  
 </feed>
diff --git a/blog.html b/blog.html
index c54638e..eff9ea6 100644
--- a/blog.html
+++ b/blog.html
@@ -156,6 +156,16 @@
 
 <li>
   <span>
+    <a class="post-link" href="/2020/07/14/bert-pytorch-tvm">Bridging PyTorch and TVM</a>
+  </span>
+  </br>
+  <span>
+    Jul 14, 2020
+  </span>
+</li>
+
+<li>
+  <span>
     <a class="post-link" href="/2020/06/04/tinyml-how-tvm-is-taming-tiny">TinyML - How TVM is Taming Tiny</a>
   </span>
   </br>
diff --git a/download.html b/download.html
index a8b086e..66c8cea 100644
--- a/download.html
+++ b/download.html
@@ -214,7 +214,7 @@ Choose your flavor of download from the following links:</p>
 <div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">$ </span>certUtil <span class="nt">-hashfile</span> pathToFileToCheck
 </code></pre></div></div>
 
-<p>Unix-like systems (and macOS) will have a utility called <code class="language-plaintext highlighter-rouge">md5</code>, <code class="language-plaintext highlighter-rouge">md5sum</code> or <code class="language-plaintext highlighter-rouge">shasum</code>.</p>
+<p>Unix-like systems (and macOS) will have a utility called <code class="highlighter-rouge">md5</code>, <code class="highlighter-rouge">md5sum</code> or <code class="highlighter-rouge">shasum</code>.</p>
 
 
       </div>
diff --git a/images/bert-pytorch/bert-tvm_49_0.svg b/images/bert-pytorch/bert-tvm_49_0.svg
new file mode 100644
index 0000000..35b0aee
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_49_0.svg
@@ -0,0 +1,691 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="2140pt" height="1916pt"
+ viewBox="0.00 0.00 2140.22 1916.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1912)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1912 2136.22,-1912 2136.22,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="1238.18" cy="-1746" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="1238.18" y="-1742.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 16 -->
+<g id="node9" class="node">
+<title>16</title>
+<polygon fill="none" stroke="black" points="1044.18,-1692 692.18,-1692 692.18,-1656 1044.18,-1656 1044.18,-1692"/>
+<text text-anchor="middle" x="868.18" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;16 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;16</title>
+<path fill="none" stroke="black" d="M1158.81,-1729.98C1102.99,-1719.42 1027.88,-1705.21 968,-1693.89"/>
+<polygon fill="black" stroke="black" points="968.63,-1690.44 958.15,-1692.02 967.33,-1697.32 968.63,-1690.44"/>
+</g>
+<!-- 26 -->
+<g id="node19" class="node">
+<title>26</title>
+<polygon fill="none" stroke="black" points="1414.18,-1692 1062.18,-1692 1062.18,-1656 1414.18,-1656 1414.18,-1692"/>
+<text text-anchor="middle" x="1238.18" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;26 -->
+<g id="edge13" class="edge">
+<title>0&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M1238.18,-1727.7C1238.18,-1719.98 1238.18,-1710.71 1238.18,-1702.11"/>
+<polygon fill="black" stroke="black" points="1241.68,-1702.1 1238.18,-1692.1 1234.68,-1702.1 1241.68,-1702.1"/>
+</g>
+<!-- 47 -->
+<g id="node39" class="node">
+<title>47</title>
+<polygon fill="none" stroke="black" points="1784.18,-1692 1432.18,-1692 1432.18,-1656 1784.18,-1656 1784.18,-1692"/>
+<text text-anchor="middle" x="1608.18" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;47 -->
+<g id="edge37" class="edge">
+<title>0&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1317.56,-1729.98C1373.37,-1719.42 1448.48,-1705.21 1508.36,-1693.89"/>
+<polygon fill="black" stroke="black" points="1509.04,-1697.32 1518.21,-1692.02 1507.74,-1690.44 1509.04,-1697.32"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="200.18" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="200.18" y="-1886.3" font-family="Times,serif" font-size="14.00">query.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 17 -->
+<g id="node10" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="310.68,-1836 117.68,-1836 117.68,-1800 310.68,-1800 310.68,-1836"/>
+<text text-anchor="middle" x="214.18" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 1&#45;&gt;17 -->
+<g id="edge2" class="edge">
+<title>1&#45;&gt;17</title>
+<path fill="none" stroke="black" d="M203.64,-1871.7C205.19,-1863.98 207.04,-1854.71 208.76,-1846.11"/>
+<polygon fill="black" stroke="black" points="212.23,-1846.6 210.76,-1836.1 205.37,-1845.22 212.23,-1846.6"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="black" cx="184.18" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="184.18" y="-1526.3" font-family="Times,serif" font-size="14.00">query.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 22 -->
+<g id="node15" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="578.68,-1476 507.68,-1476 507.68,-1440 578.68,-1440 578.68,-1476"/>
+<text text-anchor="middle" x="543.18" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 2&#45;&gt;22 -->
+<g id="edge9" class="edge">
+<title>2&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M261.2,-1513.98C333.06,-1499.97 437.89,-1479.53 497.41,-1467.92"/>
+<polygon fill="black" stroke="black" points="498.13,-1471.35 507.27,-1466 496.79,-1464.48 498.13,-1471.35"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="black" cx="608.18" cy="-1890" rx="189.57" ry="18"/>
+<text text-anchor="middle" x="608.18" y="-1886.3" font-family="Times,serif" font-size="14.00">key.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 27 -->
+<g id="node20" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="700.68,-1836 507.68,-1836 507.68,-1800 700.68,-1800 700.68,-1836"/>
+<text text-anchor="middle" x="604.18" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 3&#45;&gt;27 -->
+<g id="edge14" class="edge">
+<title>3&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M607.19,-1871.7C606.75,-1863.98 606.22,-1854.71 605.73,-1846.11"/>
+<polygon fill="black" stroke="black" points="609.22,-1845.89 605.16,-1836.1 602.24,-1846.29 609.22,-1845.89"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="black" cx="891.18" cy="-1530" rx="156.77" ry="18"/>
+<text text-anchor="middle" x="891.18" y="-1526.3" font-family="Times,serif" font-size="14.00">key.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 32 -->
+<g id="node25" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="1033.68,-1476 962.68,-1476 962.68,-1440 1033.68,-1440 1033.68,-1476"/>
+<text text-anchor="middle" x="998.18" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 4&#45;&gt;32 -->
+<g id="edge21" class="edge">
+<title>4&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M916.81,-1512.23C930.77,-1503.1 948.23,-1491.68 963.4,-1481.76"/>
+<polygon fill="black" stroke="black" points="965.57,-1484.52 972.02,-1476.12 961.73,-1478.66 965.57,-1484.52"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="black" cx="1350.18" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1350.18" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 42 -->
+<g id="node34" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1385.68,-828 1314.68,-828 1314.68,-792 1385.68,-792 1385.68,-828"/>
+<text text-anchor="middle" x="1350.18" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 5&#45;&gt;42 -->
+<g id="edge32" class="edge">
+<title>5&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1350.18,-863.7C1350.18,-855.98 1350.18,-846.71 1350.18,-838.11"/>
+<polygon fill="black" stroke="black" points="1353.68,-838.1 1350.18,-828.1 1346.68,-838.1 1353.68,-838.1"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="black" cx="1908.18" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="1908.18" y="-1886.3" font-family="Times,serif" font-size="14.00">value.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 48 -->
+<g id="node40" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="2004.68,-1836 1811.68,-1836 1811.68,-1800 2004.68,-1800 2004.68,-1836"/>
+<text text-anchor="middle" x="1908.18" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 6&#45;&gt;48 -->
+<g id="edge38" class="edge">
+<title>6&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1908.18,-1871.7C1908.18,-1863.98 1908.18,-1854.71 1908.18,-1846.11"/>
+<polygon fill="black" stroke="black" points="1911.68,-1846.1 1908.18,-1836.1 1904.68,-1846.1 1911.68,-1846.1"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="black" cx="1965.18" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="1965.18" y="-1526.3" font-family="Times,serif" font-size="14.00">value.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 53 -->
+<g id="node45" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="1641.68,-1476 1570.68,-1476 1570.68,-1440 1641.68,-1440 1641.68,-1476"/>
+<text text-anchor="middle" x="1606.18" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 7&#45;&gt;53 -->
+<g id="edge45" class="edge">
+<title>7&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1888.17,-1513.98C1816.3,-1499.97 1711.47,-1479.53 1651.95,-1467.92"/>
+<polygon fill="black" stroke="black" points="1652.58,-1464.48 1642.09,-1466 1651.24,-1471.35 1652.58,-1464.48"/>
+</g>
+<!-- 20 -->
+<g id="node13" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="627.68,-1620 458.68,-1620 458.68,-1584 627.68,-1584 627.68,-1620"/>
+<text text-anchor="middle" x="543.18" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;20 -->
+<g id="edge5" class="edge">
+<title>16&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M789.09,-1655.97C741.72,-1645.76 681.37,-1632.76 632.31,-1622.2"/>
+<polygon fill="black" stroke="black" points="632.86,-1618.73 622.34,-1620.05 631.38,-1625.58 632.86,-1618.73"/>
+</g>
+<!-- 18 -->
+<g id="node11" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="401.68,-1764 40.68,-1764 40.68,-1728 401.68,-1728 401.68,-1764"/>
+<text text-anchor="middle" x="221.18" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge3" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M215.91,-1799.7C216.68,-1791.98 217.61,-1782.71 218.47,-1774.11"/>
+<polygon fill="black" stroke="black" points="221.96,-1774.4 219.47,-1764.1 214.99,-1773.71 221.96,-1774.4"/>
+</g>
+<!-- 19 -->
+<g id="node12" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="405.68,-1692 194.68,-1692 194.68,-1656 405.68,-1656 405.68,-1692"/>
+<text text-anchor="middle" x="300.18" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 18&#45;&gt;19 -->
+<g id="edge4" class="edge">
+<title>18&#45;&gt;19</title>
+<path fill="none" stroke="black" d="M240.71,-1727.7C250.56,-1718.97 262.67,-1708.24 273.38,-1698.75"/>
+<polygon fill="black" stroke="black" points="275.72,-1701.36 280.88,-1692.1 271.07,-1696.12 275.72,-1701.36"/>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge6" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M359.32,-1655.97C393.86,-1646.01 437.63,-1633.41 473.8,-1622.99"/>
+<polygon fill="black" stroke="black" points="475.14,-1626.24 483.78,-1620.11 473.21,-1619.52 475.14,-1626.24"/>
+</g>
+<!-- 21 -->
+<g id="node14" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="716.68,-1548 369.68,-1548 369.68,-1512 716.68,-1512 716.68,-1548"/>
+<text text-anchor="middle" x="543.18" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge7" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M543.18,-1583.7C543.18,-1575.98 543.18,-1566.71 543.18,-1558.11"/>
+<polygon fill="black" stroke="black" points="546.68,-1558.1 543.18,-1548.1 539.68,-1558.1 546.68,-1558.1"/>
+</g>
+<!-- 21&#45;&gt;22 -->
+<g id="edge8" class="edge">
+<title>21&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M543.18,-1511.7C543.18,-1503.98 543.18,-1494.71 543.18,-1486.11"/>
+<polygon fill="black" stroke="black" points="546.68,-1486.1 543.18,-1476.1 539.68,-1486.1 546.68,-1486.1"/>
+</g>
+<!-- 23 -->
+<g id="node16" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="797.68,-1404 432.68,-1404 432.68,-1368 797.68,-1368 797.68,-1404"/>
+<text text-anchor="middle" x="615.18" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge10" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M560.98,-1439.7C569.79,-1431.14 580.56,-1420.66 590.18,-1411.3"/>
+<polygon fill="black" stroke="black" points="592.86,-1413.58 597.59,-1404.1 587.98,-1408.57 592.86,-1413.58"/>
+</g>
+<!-- 24 -->
+<g id="node17" class="node">
+<title>24</title>
+<polygon fill="none" stroke="black" points="740.68,-1332 511.68,-1332 511.68,-1296 740.68,-1296 740.68,-1332"/>
+<text text-anchor="middle" x="626.18" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 23&#45;&gt;24 -->
+<g id="edge11" class="edge">
+<title>23&#45;&gt;24</title>
+<path fill="none" stroke="black" d="M617.9,-1367.7C619.11,-1359.98 620.57,-1350.71 621.92,-1342.11"/>
+<polygon fill="black" stroke="black" points="625.4,-1342.53 623.49,-1332.1 618.48,-1341.44 625.4,-1342.53"/>
+</g>
+<!-- 25 -->
+<g id="node18" class="node">
+<title>25</title>
+<polygon fill="none" stroke="black" points="863.68,-1116 520.68,-1116 520.68,-1080 863.68,-1080 863.68,-1116"/>
+<text text-anchor="middle" x="692.18" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 24&#45;&gt;25 -->
+<g id="edge12" class="edge">
+<title>24&#45;&gt;25</title>
+<path fill="none" stroke="black" d="M631.47,-1295.85C642.94,-1258.68 670.15,-1170.44 683.91,-1125.82"/>
+<polygon fill="black" stroke="black" points="687.26,-1126.82 686.87,-1116.23 680.58,-1124.76 687.26,-1126.82"/>
+</g>
+<!-- 38 -->
+<g id="node31" class="node">
+<title>38</title>
+<polygon fill="none" stroke="black" points="1082.68,-1044 913.68,-1044 913.68,-1008 1082.68,-1008 1082.68,-1044"/>
+<text text-anchor="middle" x="998.18" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 25&#45;&gt;38 -->
+<g id="edge27" class="edge">
+<title>25&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M766.65,-1079.97C811.06,-1069.8 867.6,-1056.87 913.68,-1046.33"/>
+<polygon fill="black" stroke="black" points="914.68,-1049.69 923.65,-1044.05 913.12,-1042.87 914.68,-1049.69"/>
+</g>
+<!-- 30 -->
+<g id="node23" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="1322.68,-1620 1153.68,-1620 1153.68,-1584 1322.68,-1584 1322.68,-1620"/>
+<text text-anchor="middle" x="1238.18" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 26&#45;&gt;30 -->
+<g id="edge17" class="edge">
+<title>26&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M1238.18,-1655.7C1238.18,-1647.98 1238.18,-1638.71 1238.18,-1630.11"/>
+<polygon fill="black" stroke="black" points="1241.68,-1630.1 1238.18,-1620.1 1234.68,-1630.1 1241.68,-1630.1"/>
+</g>
+<!-- 28 -->
+<g id="node21" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="782.68,-1764 421.68,-1764 421.68,-1728 782.68,-1728 782.68,-1764"/>
+<text text-anchor="middle" x="602.18" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 27&#45;&gt;28 -->
+<g id="edge15" class="edge">
+<title>27&#45;&gt;28</title>
+<path fill="none" stroke="black" d="M603.69,-1799.7C603.47,-1791.98 603.2,-1782.71 602.96,-1774.11"/>
+<polygon fill="black" stroke="black" points="606.46,-1774 602.67,-1764.1 599.46,-1774.2 606.46,-1774"/>
+</g>
+<!-- 29 -->
+<g id="node22" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="673.68,-1692 462.68,-1692 462.68,-1656 673.68,-1656 673.68,-1692"/>
+<text text-anchor="middle" x="568.18" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 28&#45;&gt;29 -->
+<g id="edge16" class="edge">
+<title>28&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M593.78,-1727.7C589.91,-1719.73 585.23,-1710.1 580.94,-1701.26"/>
+<polygon fill="black" stroke="black" points="584.01,-1699.57 576.49,-1692.1 577.71,-1702.63 584.01,-1699.57"/>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge18" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M673.75,-1657.18C676.93,-1656.77 680.08,-1656.38 683.18,-1656 844.38,-1636.4 1032.65,-1619.73 1143.28,-1610.56"/>
+<polygon fill="black" stroke="black" points="1143.73,-1614.04 1153.41,-1609.73 1143.16,-1607.06 1143.73,-1614.04"/>
+</g>
+<!-- 31 -->
+<g id="node24" class="node">
+<title>31</title>
+<polygon fill="none" stroke="black" points="1412.68,-1548 1065.68,-1548 1065.68,-1512 1412.68,-1512 1412.68,-1548"/>
+<text text-anchor="middle" x="1239.18" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 30&#45;&gt;31 -->
+<g id="edge19" class="edge">
+<title>30&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M1238.43,-1583.7C1238.54,-1575.98 1238.67,-1566.71 1238.79,-1558.11"/>
+<polygon fill="black" stroke="black" points="1242.29,-1558.15 1238.94,-1548.1 1235.3,-1558.05 1242.29,-1558.15"/>
+</g>
+<!-- 31&#45;&gt;32 -->
+<g id="edge20" class="edge">
+<title>31&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M1180.54,-1511.97C1138.46,-1499.74 1082.57,-1483.51 1043.81,-1472.25"/>
+<polygon fill="black" stroke="black" points="1044.43,-1468.79 1033.85,-1469.36 1042.48,-1475.51 1044.43,-1468.79"/>
+</g>
+<!-- 33 -->
+<g id="node26" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="1180.68,-1404 815.68,-1404 815.68,-1368 1180.68,-1368 1180.68,-1404"/>
+<text text-anchor="middle" x="998.18" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge22" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M998.18,-1439.7C998.18,-1431.98 998.18,-1422.71 998.18,-1414.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1414.1 998.18,-1404.1 994.68,-1414.1 1001.68,-1414.1"/>
+</g>
+<!-- 34 -->
+<g id="node27" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="1112.68,-1332 883.68,-1332 883.68,-1296 1112.68,-1296 1112.68,-1332"/>
+<text text-anchor="middle" x="998.18" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge23" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M998.18,-1367.7C998.18,-1359.98 998.18,-1350.71 998.18,-1342.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1342.1 998.18,-1332.1 994.68,-1342.1 1001.68,-1342.1"/>
+</g>
+<!-- 35 -->
+<g id="node28" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="1112.68,-1260 883.68,-1260 883.68,-1224 1112.68,-1224 1112.68,-1260"/>
+<text text-anchor="middle" x="998.18" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge24" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M998.18,-1295.7C998.18,-1287.98 998.18,-1278.71 998.18,-1270.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1270.1 998.18,-1260.1 994.68,-1270.1 1001.68,-1270.1"/>
+</g>
+<!-- 36 -->
+<g id="node29" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="1169.68,-1188 826.68,-1188 826.68,-1152 1169.68,-1152 1169.68,-1188"/>
+<text text-anchor="middle" x="998.18" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge25" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M998.18,-1223.7C998.18,-1215.98 998.18,-1206.71 998.18,-1198.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1198.1 998.18,-1188.1 994.68,-1198.1 1001.68,-1198.1"/>
+</g>
+<!-- 37 -->
+<g id="node30" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1103.68,-1116 892.68,-1116 892.68,-1080 1103.68,-1080 1103.68,-1116"/>
+<text text-anchor="middle" x="998.18" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 36&#45;&gt;37 -->
+<g id="edge26" class="edge">
+<title>36&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M998.18,-1151.7C998.18,-1143.98 998.18,-1134.71 998.18,-1126.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1126.1 998.18,-1116.1 994.68,-1126.1 1001.68,-1126.1"/>
+</g>
+<!-- 37&#45;&gt;38 -->
+<g id="edge28" class="edge">
+<title>37&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M998.18,-1079.7C998.18,-1071.98 998.18,-1062.71 998.18,-1054.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1054.1 998.18,-1044.1 994.68,-1054.1 1001.68,-1054.1"/>
+</g>
+<!-- 39 -->
+<g id="node32" class="node">
+<title>39</title>
+<polygon fill="none" stroke="black" points="1216.68,-972 851.68,-972 851.68,-936 1216.68,-936 1216.68,-972"/>
+<text text-anchor="middle" x="1034.18" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 38&#45;&gt;39 -->
+<g id="edge29" class="edge">
+<title>38&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M1007.08,-1007.7C1011.18,-999.73 1016.13,-990.1 1020.67,-981.26"/>
+<polygon fill="black" stroke="black" points="1023.92,-982.6 1025.39,-972.1 1017.7,-979.4 1023.92,-982.6"/>
+</g>
+<!-- 41 -->
+<g id="node33" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1105.68,-900 998.68,-900 998.68,-864 1105.68,-864 1105.68,-900"/>
+<text text-anchor="middle" x="1052.18" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 39&#45;&gt;41 -->
+<g id="edge30" class="edge">
+<title>39&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1038.63,-935.7C1040.64,-927.9 1043.05,-918.51 1045.28,-909.83"/>
+<polygon fill="black" stroke="black" points="1048.68,-910.66 1047.78,-900.1 1041.9,-908.92 1048.68,-910.66"/>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge31" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1106.06,-868.07C1111.84,-866.69 1117.63,-865.31 1123.18,-864 1185.98,-849.15 1258.61,-832.24 1304.46,-821.6"/>
+<polygon fill="black" stroke="black" points="1305.44,-824.96 1314.39,-819.29 1303.86,-818.14 1305.44,-824.96"/>
+</g>
+<!-- 43 -->
+<g id="node35" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1445.18,-756 1271.18,-756 1271.18,-720 1445.18,-720 1445.18,-756"/>
+<text text-anchor="middle" x="1358.18" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge33" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1352.16,-791.7C1353.04,-783.98 1354.1,-774.71 1355.08,-766.11"/>
+<polygon fill="black" stroke="black" points="1358.57,-766.44 1356.23,-756.1 1351.61,-765.64 1358.57,-766.44"/>
+</g>
+<!-- 44 -->
+<g id="node36" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1472.68,-684 1289.68,-684 1289.68,-648 1472.68,-648 1472.68,-684"/>
+<text text-anchor="middle" x="1381.18" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 43&#45;&gt;44 -->
+<g id="edge34" class="edge">
+<title>43&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M1363.87,-719.7C1366.43,-711.9 1369.52,-702.51 1372.37,-693.83"/>
+<polygon fill="black" stroke="black" points="1375.77,-694.7 1375.56,-684.1 1369.12,-692.51 1375.77,-694.7"/>
+</g>
+<!-- 45 -->
+<g id="node37" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="1477.18,-612 1309.18,-612 1309.18,-576 1477.18,-576 1477.18,-612"/>
+<text text-anchor="middle" x="1393.18" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 44&#45;&gt;45 -->
+<g id="edge35" class="edge">
+<title>44&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M1384.15,-647.7C1385.47,-639.98 1387.06,-630.71 1388.53,-622.11"/>
+<polygon fill="black" stroke="black" points="1392.01,-622.55 1390.25,-612.1 1385.11,-621.37 1392.01,-622.55"/>
+</g>
+<!-- 46 -->
+<g id="node38" class="node">
+<title>46</title>
+<polygon fill="none" stroke="black" points="1570.68,-540 1227.68,-540 1227.68,-504 1570.68,-504 1570.68,-540"/>
+<text text-anchor="middle" x="1399.18" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 45&#45;&gt;46 -->
+<g id="edge36" class="edge">
+<title>45&#45;&gt;46</title>
+<path fill="none" stroke="black" d="M1394.67,-575.7C1395.33,-567.98 1396.12,-558.71 1396.86,-550.11"/>
+<polygon fill="black" stroke="black" points="1400.35,-550.37 1397.72,-540.1 1393.37,-549.77 1400.35,-550.37"/>
+</g>
+<!-- 58 -->
+<g id="node50" class="node">
+<title>58</title>
+<polygon fill="none" stroke="black" points="1582.68,-468 1413.68,-468 1413.68,-432 1582.68,-432 1582.68,-468"/>
+<text text-anchor="middle" x="1498.18" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 46&#45;&gt;58 -->
+<g id="edge50" class="edge">
+<title>46&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1423.4,-503.88C1436.1,-494.89 1451.85,-483.76 1465.61,-474.03"/>
+<polygon fill="black" stroke="black" points="1467.73,-476.82 1473.87,-468.19 1463.69,-471.11 1467.73,-476.82"/>
+</g>
+<!-- 51 -->
+<g id="node43" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="1692.68,-1620 1523.68,-1620 1523.68,-1584 1692.68,-1584 1692.68,-1620"/>
+<text text-anchor="middle" x="1608.18" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 47&#45;&gt;51 -->
+<g id="edge41" class="edge">
+<title>47&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1608.18,-1655.7C1608.18,-1647.98 1608.18,-1638.71 1608.18,-1630.11"/>
+<polygon fill="black" stroke="black" points="1611.68,-1630.1 1608.18,-1620.1 1604.68,-1630.1 1611.68,-1630.1"/>
+</g>
+<!-- 49 -->
+<g id="node41" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="2088.68,-1764 1727.68,-1764 1727.68,-1728 2088.68,-1728 2088.68,-1764"/>
+<text text-anchor="middle" x="1908.18" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge39" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1908.18,-1799.7C1908.18,-1791.98 1908.18,-1782.71 1908.18,-1774.11"/>
+<polygon fill="black" stroke="black" points="1911.68,-1774.1 1908.18,-1764.1 1904.68,-1774.1 1911.68,-1774.1"/>
+</g>
+<!-- 50 -->
+<g id="node42" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="2013.68,-1692 1802.68,-1692 1802.68,-1656 2013.68,-1656 2013.68,-1692"/>
+<text text-anchor="middle" x="1908.18" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 49&#45;&gt;50 -->
+<g id="edge40" class="edge">
+<title>49&#45;&gt;50</title>
+<path fill="none" stroke="black" d="M1908.18,-1727.7C1908.18,-1719.98 1908.18,-1710.71 1908.18,-1702.11"/>
+<polygon fill="black" stroke="black" points="1911.68,-1702.1 1908.18,-1692.1 1904.68,-1702.1 1911.68,-1702.1"/>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge42" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1835.18,-1655.97C1791.63,-1645.8 1736.21,-1632.87 1691.03,-1622.33"/>
+<polygon fill="black" stroke="black" points="1691.79,-1618.91 1681.25,-1620.05 1690.2,-1625.73 1691.79,-1618.91"/>
+</g>
+<!-- 52 -->
+<g id="node44" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="1779.68,-1548 1432.68,-1548 1432.68,-1512 1779.68,-1512 1779.68,-1548"/>
+<text text-anchor="middle" x="1606.18" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge43" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M1607.69,-1583.7C1607.47,-1575.98 1607.2,-1566.71 1606.96,-1558.11"/>
+<polygon fill="black" stroke="black" points="1610.46,-1558 1606.67,-1548.1 1603.46,-1558.2 1610.46,-1558"/>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge44" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1606.18,-1511.7C1606.18,-1503.98 1606.18,-1494.71 1606.18,-1486.11"/>
+<polygon fill="black" stroke="black" points="1609.68,-1486.1 1606.18,-1476.1 1602.68,-1486.1 1609.68,-1486.1"/>
+</g>
+<!-- 54 -->
+<g id="node46" class="node">
+<title>54</title>
+<polygon fill="none" stroke="black" points="1787.68,-1404 1422.68,-1404 1422.68,-1368 1787.68,-1368 1787.68,-1404"/>
+<text text-anchor="middle" x="1605.18" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 53&#45;&gt;54 -->
+<g id="edge46" class="edge">
+<title>53&#45;&gt;54</title>
+<path fill="none" stroke="black" d="M1605.94,-1439.7C1605.82,-1431.98 1605.69,-1422.71 1605.57,-1414.11"/>
+<polygon fill="black" stroke="black" points="1609.07,-1414.05 1605.43,-1404.1 1602.07,-1414.15 1609.07,-1414.05"/>
+</g>
+<!-- 55 -->
+<g id="node47" class="node">
+<title>55</title>
+<polygon fill="none" stroke="black" points="1719.68,-1332 1490.68,-1332 1490.68,-1296 1719.68,-1296 1719.68,-1332"/>
+<text text-anchor="middle" x="1605.18" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 54&#45;&gt;55 -->
+<g id="edge47" class="edge">
+<title>54&#45;&gt;55</title>
+<path fill="none" stroke="black" d="M1605.18,-1367.7C1605.18,-1359.98 1605.18,-1350.71 1605.18,-1342.11"/>
+<polygon fill="black" stroke="black" points="1608.68,-1342.1 1605.18,-1332.1 1601.68,-1342.1 1608.68,-1342.1"/>
+</g>
+<!-- 56 -->
+<g id="node48" class="node">
+<title>56</title>
+<polygon fill="none" stroke="black" points="1775.68,-1260 1432.68,-1260 1432.68,-1224 1775.68,-1224 1775.68,-1260"/>
+<text text-anchor="middle" x="1604.18" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 55&#45;&gt;56 -->
+<g id="edge48" class="edge">
+<title>55&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1604.94,-1295.7C1604.82,-1287.98 1604.69,-1278.71 1604.57,-1270.11"/>
+<polygon fill="black" stroke="black" points="1608.07,-1270.05 1604.43,-1260.1 1601.07,-1270.15 1608.07,-1270.05"/>
+</g>
+<!-- 57 -->
+<g id="node49" class="node">
+<title>57</title>
+<polygon fill="none" stroke="black" points="1706.68,-1044 1495.68,-1044 1495.68,-1008 1706.68,-1008 1706.68,-1044"/>
+<text text-anchor="middle" x="1601.18" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 56&#45;&gt;57 -->
+<g id="edge49" class="edge">
+<title>56&#45;&gt;57</title>
+<path fill="none" stroke="black" d="M1603.94,-1223.85C1603.42,-1186.83 1602.19,-1099.18 1601.57,-1054.39"/>
+<polygon fill="black" stroke="black" points="1605.06,-1054.18 1601.42,-1044.23 1598.06,-1054.28 1605.06,-1054.18"/>
+</g>
+<!-- 57&#45;&gt;58 -->
+<g id="edge51" class="edge">
+<title>57&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1601.18,-1007.95C1601.18,-981.29 1601.18,-928.11 1601.18,-883 1601.18,-883 1601.18,-883 1601.18,-593 1601.18,-552.36 1603.61,-537.21 1580.18,-504 1571.45,-491.62 1558.82,-481.42 1546.08,-473.4"/>
+<polygon fill="black" stroke="black" points="1547.67,-470.27 1537.29,-468.19 1544.1,-476.3 1547.67,-470.27"/>
+</g>
+<!-- 59 -->
+<g id="node51" class="node">
+<title>59</title>
+<polygon fill="none" stroke="black" points="1680.68,-396 1315.68,-396 1315.68,-360 1680.68,-360 1680.68,-396"/>
+<text text-anchor="middle" x="1498.18" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 58&#45;&gt;59 -->
+<g id="edge52" class="edge">
+<title>58&#45;&gt;59</title>
+<path fill="none" stroke="black" d="M1498.18,-431.7C1498.18,-423.98 1498.18,-414.71 1498.18,-406.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-406.1 1498.18,-396.1 1494.68,-406.1 1501.68,-406.1"/>
+</g>
+<!-- 60 -->
+<g id="node52" class="node">
+<title>60</title>
+<polygon fill="none" stroke="black" points="1612.68,-324 1383.68,-324 1383.68,-288 1612.68,-288 1612.68,-324"/>
+<text text-anchor="middle" x="1498.18" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 59&#45;&gt;60 -->
+<g id="edge53" class="edge">
+<title>59&#45;&gt;60</title>
+<path fill="none" stroke="black" d="M1498.18,-359.7C1498.18,-351.98 1498.18,-342.71 1498.18,-334.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-334.1 1498.18,-324.1 1494.68,-334.1 1501.68,-334.1"/>
+</g>
+<!-- 61 -->
+<g id="node53" class="node">
+<title>61</title>
+<polygon fill="none" stroke="black" points="1530.68,-252 1465.68,-252 1465.68,-216 1530.68,-216 1530.68,-252"/>
+<text text-anchor="middle" x="1498.18" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 60&#45;&gt;61 -->
+<g id="edge54" class="edge">
+<title>60&#45;&gt;61</title>
+<path fill="none" stroke="black" d="M1498.18,-287.7C1498.18,-279.98 1498.18,-270.71 1498.18,-262.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-262.1 1498.18,-252.1 1494.68,-262.1 1501.68,-262.1"/>
+</g>
+<!-- 62 -->
+<g id="node54" class="node">
+<title>62</title>
+<polygon fill="none" stroke="black" points="1671.68,-180 1324.68,-180 1324.68,-144 1671.68,-144 1671.68,-180"/>
+<text text-anchor="middle" x="1498.18" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 61&#45;&gt;62 -->
+<g id="edge55" class="edge">
+<title>61&#45;&gt;62</title>
+<path fill="none" stroke="black" d="M1498.18,-215.7C1498.18,-207.98 1498.18,-198.71 1498.18,-190.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-190.1 1498.18,-180.1 1494.68,-190.1 1501.68,-190.1"/>
+</g>
+<!-- 63 -->
+<g id="node55" class="node">
+<title>63</title>
+<polygon fill="none" stroke="black" points="1541.18,-108 1455.18,-108 1455.18,-72 1541.18,-72 1541.18,-108"/>
+<text text-anchor="middle" x="1498.18" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 62&#45;&gt;63 -->
+<g id="edge56" class="edge">
+<title>62&#45;&gt;63</title>
+<path fill="none" stroke="black" d="M1498.18,-143.7C1498.18,-135.98 1498.18,-126.71 1498.18,-118.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-118.1 1498.18,-108.1 1494.68,-118.1 1501.68,-118.1"/>
+</g>
+<!-- 64 -->
+<g id="node56" class="node">
+<title>64</title>
+<polygon fill="none" stroke="black" points="1538.18,-36 1458.18,-36 1458.18,0 1538.18,0 1538.18,-36"/>
+<text text-anchor="middle" x="1498.18" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 63&#45;&gt;64 -->
+<g id="edge57" class="edge">
+<title>63&#45;&gt;64</title>
+<path fill="none" stroke="black" d="M1498.18,-71.7C1498.18,-63.98 1498.18,-54.71 1498.18,-46.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-46.1 1498.18,-36.1 1494.68,-46.1 1501.68,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert-tvm_54_0.svg b/images/bert-pytorch/bert-tvm_54_0.svg
new file mode 100644
index 0000000..35b0aee
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_54_0.svg
@@ -0,0 +1,691 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="2140pt" height="1916pt"
+ viewBox="0.00 0.00 2140.22 1916.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1912)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1912 2136.22,-1912 2136.22,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="1238.18" cy="-1746" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="1238.18" y="-1742.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 16 -->
+<g id="node9" class="node">
+<title>16</title>
+<polygon fill="none" stroke="black" points="1044.18,-1692 692.18,-1692 692.18,-1656 1044.18,-1656 1044.18,-1692"/>
+<text text-anchor="middle" x="868.18" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;16 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;16</title>
+<path fill="none" stroke="black" d="M1158.81,-1729.98C1102.99,-1719.42 1027.88,-1705.21 968,-1693.89"/>
+<polygon fill="black" stroke="black" points="968.63,-1690.44 958.15,-1692.02 967.33,-1697.32 968.63,-1690.44"/>
+</g>
+<!-- 26 -->
+<g id="node19" class="node">
+<title>26</title>
+<polygon fill="none" stroke="black" points="1414.18,-1692 1062.18,-1692 1062.18,-1656 1414.18,-1656 1414.18,-1692"/>
+<text text-anchor="middle" x="1238.18" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;26 -->
+<g id="edge13" class="edge">
+<title>0&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M1238.18,-1727.7C1238.18,-1719.98 1238.18,-1710.71 1238.18,-1702.11"/>
+<polygon fill="black" stroke="black" points="1241.68,-1702.1 1238.18,-1692.1 1234.68,-1702.1 1241.68,-1702.1"/>
+</g>
+<!-- 47 -->
+<g id="node39" class="node">
+<title>47</title>
+<polygon fill="none" stroke="black" points="1784.18,-1692 1432.18,-1692 1432.18,-1656 1784.18,-1656 1784.18,-1692"/>
+<text text-anchor="middle" x="1608.18" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;47 -->
+<g id="edge37" class="edge">
+<title>0&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1317.56,-1729.98C1373.37,-1719.42 1448.48,-1705.21 1508.36,-1693.89"/>
+<polygon fill="black" stroke="black" points="1509.04,-1697.32 1518.21,-1692.02 1507.74,-1690.44 1509.04,-1697.32"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="200.18" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="200.18" y="-1886.3" font-family="Times,serif" font-size="14.00">query.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 17 -->
+<g id="node10" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="310.68,-1836 117.68,-1836 117.68,-1800 310.68,-1800 310.68,-1836"/>
+<text text-anchor="middle" x="214.18" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 1&#45;&gt;17 -->
+<g id="edge2" class="edge">
+<title>1&#45;&gt;17</title>
+<path fill="none" stroke="black" d="M203.64,-1871.7C205.19,-1863.98 207.04,-1854.71 208.76,-1846.11"/>
+<polygon fill="black" stroke="black" points="212.23,-1846.6 210.76,-1836.1 205.37,-1845.22 212.23,-1846.6"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="black" cx="184.18" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="184.18" y="-1526.3" font-family="Times,serif" font-size="14.00">query.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 22 -->
+<g id="node15" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="578.68,-1476 507.68,-1476 507.68,-1440 578.68,-1440 578.68,-1476"/>
+<text text-anchor="middle" x="543.18" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 2&#45;&gt;22 -->
+<g id="edge9" class="edge">
+<title>2&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M261.2,-1513.98C333.06,-1499.97 437.89,-1479.53 497.41,-1467.92"/>
+<polygon fill="black" stroke="black" points="498.13,-1471.35 507.27,-1466 496.79,-1464.48 498.13,-1471.35"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="black" cx="608.18" cy="-1890" rx="189.57" ry="18"/>
+<text text-anchor="middle" x="608.18" y="-1886.3" font-family="Times,serif" font-size="14.00">key.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 27 -->
+<g id="node20" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="700.68,-1836 507.68,-1836 507.68,-1800 700.68,-1800 700.68,-1836"/>
+<text text-anchor="middle" x="604.18" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 3&#45;&gt;27 -->
+<g id="edge14" class="edge">
+<title>3&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M607.19,-1871.7C606.75,-1863.98 606.22,-1854.71 605.73,-1846.11"/>
+<polygon fill="black" stroke="black" points="609.22,-1845.89 605.16,-1836.1 602.24,-1846.29 609.22,-1845.89"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="black" cx="891.18" cy="-1530" rx="156.77" ry="18"/>
+<text text-anchor="middle" x="891.18" y="-1526.3" font-family="Times,serif" font-size="14.00">key.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 32 -->
+<g id="node25" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="1033.68,-1476 962.68,-1476 962.68,-1440 1033.68,-1440 1033.68,-1476"/>
+<text text-anchor="middle" x="998.18" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 4&#45;&gt;32 -->
+<g id="edge21" class="edge">
+<title>4&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M916.81,-1512.23C930.77,-1503.1 948.23,-1491.68 963.4,-1481.76"/>
+<polygon fill="black" stroke="black" points="965.57,-1484.52 972.02,-1476.12 961.73,-1478.66 965.57,-1484.52"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="black" cx="1350.18" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1350.18" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 42 -->
+<g id="node34" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1385.68,-828 1314.68,-828 1314.68,-792 1385.68,-792 1385.68,-828"/>
+<text text-anchor="middle" x="1350.18" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 5&#45;&gt;42 -->
+<g id="edge32" class="edge">
+<title>5&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1350.18,-863.7C1350.18,-855.98 1350.18,-846.71 1350.18,-838.11"/>
+<polygon fill="black" stroke="black" points="1353.68,-838.1 1350.18,-828.1 1346.68,-838.1 1353.68,-838.1"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="black" cx="1908.18" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="1908.18" y="-1886.3" font-family="Times,serif" font-size="14.00">value.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 48 -->
+<g id="node40" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="2004.68,-1836 1811.68,-1836 1811.68,-1800 2004.68,-1800 2004.68,-1836"/>
+<text text-anchor="middle" x="1908.18" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 6&#45;&gt;48 -->
+<g id="edge38" class="edge">
+<title>6&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1908.18,-1871.7C1908.18,-1863.98 1908.18,-1854.71 1908.18,-1846.11"/>
+<polygon fill="black" stroke="black" points="1911.68,-1846.1 1908.18,-1836.1 1904.68,-1846.1 1911.68,-1846.1"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="black" cx="1965.18" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="1965.18" y="-1526.3" font-family="Times,serif" font-size="14.00">value.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 53 -->
+<g id="node45" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="1641.68,-1476 1570.68,-1476 1570.68,-1440 1641.68,-1440 1641.68,-1476"/>
+<text text-anchor="middle" x="1606.18" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 7&#45;&gt;53 -->
+<g id="edge45" class="edge">
+<title>7&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1888.17,-1513.98C1816.3,-1499.97 1711.47,-1479.53 1651.95,-1467.92"/>
+<polygon fill="black" stroke="black" points="1652.58,-1464.48 1642.09,-1466 1651.24,-1471.35 1652.58,-1464.48"/>
+</g>
+<!-- 20 -->
+<g id="node13" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="627.68,-1620 458.68,-1620 458.68,-1584 627.68,-1584 627.68,-1620"/>
+<text text-anchor="middle" x="543.18" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;20 -->
+<g id="edge5" class="edge">
+<title>16&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M789.09,-1655.97C741.72,-1645.76 681.37,-1632.76 632.31,-1622.2"/>
+<polygon fill="black" stroke="black" points="632.86,-1618.73 622.34,-1620.05 631.38,-1625.58 632.86,-1618.73"/>
+</g>
+<!-- 18 -->
+<g id="node11" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="401.68,-1764 40.68,-1764 40.68,-1728 401.68,-1728 401.68,-1764"/>
+<text text-anchor="middle" x="221.18" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge3" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M215.91,-1799.7C216.68,-1791.98 217.61,-1782.71 218.47,-1774.11"/>
+<polygon fill="black" stroke="black" points="221.96,-1774.4 219.47,-1764.1 214.99,-1773.71 221.96,-1774.4"/>
+</g>
+<!-- 19 -->
+<g id="node12" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="405.68,-1692 194.68,-1692 194.68,-1656 405.68,-1656 405.68,-1692"/>
+<text text-anchor="middle" x="300.18" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 18&#45;&gt;19 -->
+<g id="edge4" class="edge">
+<title>18&#45;&gt;19</title>
+<path fill="none" stroke="black" d="M240.71,-1727.7C250.56,-1718.97 262.67,-1708.24 273.38,-1698.75"/>
+<polygon fill="black" stroke="black" points="275.72,-1701.36 280.88,-1692.1 271.07,-1696.12 275.72,-1701.36"/>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge6" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M359.32,-1655.97C393.86,-1646.01 437.63,-1633.41 473.8,-1622.99"/>
+<polygon fill="black" stroke="black" points="475.14,-1626.24 483.78,-1620.11 473.21,-1619.52 475.14,-1626.24"/>
+</g>
+<!-- 21 -->
+<g id="node14" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="716.68,-1548 369.68,-1548 369.68,-1512 716.68,-1512 716.68,-1548"/>
+<text text-anchor="middle" x="543.18" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge7" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M543.18,-1583.7C543.18,-1575.98 543.18,-1566.71 543.18,-1558.11"/>
+<polygon fill="black" stroke="black" points="546.68,-1558.1 543.18,-1548.1 539.68,-1558.1 546.68,-1558.1"/>
+</g>
+<!-- 21&#45;&gt;22 -->
+<g id="edge8" class="edge">
+<title>21&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M543.18,-1511.7C543.18,-1503.98 543.18,-1494.71 543.18,-1486.11"/>
+<polygon fill="black" stroke="black" points="546.68,-1486.1 543.18,-1476.1 539.68,-1486.1 546.68,-1486.1"/>
+</g>
+<!-- 23 -->
+<g id="node16" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="797.68,-1404 432.68,-1404 432.68,-1368 797.68,-1368 797.68,-1404"/>
+<text text-anchor="middle" x="615.18" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge10" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M560.98,-1439.7C569.79,-1431.14 580.56,-1420.66 590.18,-1411.3"/>
+<polygon fill="black" stroke="black" points="592.86,-1413.58 597.59,-1404.1 587.98,-1408.57 592.86,-1413.58"/>
+</g>
+<!-- 24 -->
+<g id="node17" class="node">
+<title>24</title>
+<polygon fill="none" stroke="black" points="740.68,-1332 511.68,-1332 511.68,-1296 740.68,-1296 740.68,-1332"/>
+<text text-anchor="middle" x="626.18" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 23&#45;&gt;24 -->
+<g id="edge11" class="edge">
+<title>23&#45;&gt;24</title>
+<path fill="none" stroke="black" d="M617.9,-1367.7C619.11,-1359.98 620.57,-1350.71 621.92,-1342.11"/>
+<polygon fill="black" stroke="black" points="625.4,-1342.53 623.49,-1332.1 618.48,-1341.44 625.4,-1342.53"/>
+</g>
+<!-- 25 -->
+<g id="node18" class="node">
+<title>25</title>
+<polygon fill="none" stroke="black" points="863.68,-1116 520.68,-1116 520.68,-1080 863.68,-1080 863.68,-1116"/>
+<text text-anchor="middle" x="692.18" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 24&#45;&gt;25 -->
+<g id="edge12" class="edge">
+<title>24&#45;&gt;25</title>
+<path fill="none" stroke="black" d="M631.47,-1295.85C642.94,-1258.68 670.15,-1170.44 683.91,-1125.82"/>
+<polygon fill="black" stroke="black" points="687.26,-1126.82 686.87,-1116.23 680.58,-1124.76 687.26,-1126.82"/>
+</g>
+<!-- 38 -->
+<g id="node31" class="node">
+<title>38</title>
+<polygon fill="none" stroke="black" points="1082.68,-1044 913.68,-1044 913.68,-1008 1082.68,-1008 1082.68,-1044"/>
+<text text-anchor="middle" x="998.18" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 25&#45;&gt;38 -->
+<g id="edge27" class="edge">
+<title>25&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M766.65,-1079.97C811.06,-1069.8 867.6,-1056.87 913.68,-1046.33"/>
+<polygon fill="black" stroke="black" points="914.68,-1049.69 923.65,-1044.05 913.12,-1042.87 914.68,-1049.69"/>
+</g>
+<!-- 30 -->
+<g id="node23" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="1322.68,-1620 1153.68,-1620 1153.68,-1584 1322.68,-1584 1322.68,-1620"/>
+<text text-anchor="middle" x="1238.18" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 26&#45;&gt;30 -->
+<g id="edge17" class="edge">
+<title>26&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M1238.18,-1655.7C1238.18,-1647.98 1238.18,-1638.71 1238.18,-1630.11"/>
+<polygon fill="black" stroke="black" points="1241.68,-1630.1 1238.18,-1620.1 1234.68,-1630.1 1241.68,-1630.1"/>
+</g>
+<!-- 28 -->
+<g id="node21" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="782.68,-1764 421.68,-1764 421.68,-1728 782.68,-1728 782.68,-1764"/>
+<text text-anchor="middle" x="602.18" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 27&#45;&gt;28 -->
+<g id="edge15" class="edge">
+<title>27&#45;&gt;28</title>
+<path fill="none" stroke="black" d="M603.69,-1799.7C603.47,-1791.98 603.2,-1782.71 602.96,-1774.11"/>
+<polygon fill="black" stroke="black" points="606.46,-1774 602.67,-1764.1 599.46,-1774.2 606.46,-1774"/>
+</g>
+<!-- 29 -->
+<g id="node22" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="673.68,-1692 462.68,-1692 462.68,-1656 673.68,-1656 673.68,-1692"/>
+<text text-anchor="middle" x="568.18" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 28&#45;&gt;29 -->
+<g id="edge16" class="edge">
+<title>28&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M593.78,-1727.7C589.91,-1719.73 585.23,-1710.1 580.94,-1701.26"/>
+<polygon fill="black" stroke="black" points="584.01,-1699.57 576.49,-1692.1 577.71,-1702.63 584.01,-1699.57"/>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge18" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M673.75,-1657.18C676.93,-1656.77 680.08,-1656.38 683.18,-1656 844.38,-1636.4 1032.65,-1619.73 1143.28,-1610.56"/>
+<polygon fill="black" stroke="black" points="1143.73,-1614.04 1153.41,-1609.73 1143.16,-1607.06 1143.73,-1614.04"/>
+</g>
+<!-- 31 -->
+<g id="node24" class="node">
+<title>31</title>
+<polygon fill="none" stroke="black" points="1412.68,-1548 1065.68,-1548 1065.68,-1512 1412.68,-1512 1412.68,-1548"/>
+<text text-anchor="middle" x="1239.18" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 30&#45;&gt;31 -->
+<g id="edge19" class="edge">
+<title>30&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M1238.43,-1583.7C1238.54,-1575.98 1238.67,-1566.71 1238.79,-1558.11"/>
+<polygon fill="black" stroke="black" points="1242.29,-1558.15 1238.94,-1548.1 1235.3,-1558.05 1242.29,-1558.15"/>
+</g>
+<!-- 31&#45;&gt;32 -->
+<g id="edge20" class="edge">
+<title>31&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M1180.54,-1511.97C1138.46,-1499.74 1082.57,-1483.51 1043.81,-1472.25"/>
+<polygon fill="black" stroke="black" points="1044.43,-1468.79 1033.85,-1469.36 1042.48,-1475.51 1044.43,-1468.79"/>
+</g>
+<!-- 33 -->
+<g id="node26" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="1180.68,-1404 815.68,-1404 815.68,-1368 1180.68,-1368 1180.68,-1404"/>
+<text text-anchor="middle" x="998.18" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge22" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M998.18,-1439.7C998.18,-1431.98 998.18,-1422.71 998.18,-1414.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1414.1 998.18,-1404.1 994.68,-1414.1 1001.68,-1414.1"/>
+</g>
+<!-- 34 -->
+<g id="node27" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="1112.68,-1332 883.68,-1332 883.68,-1296 1112.68,-1296 1112.68,-1332"/>
+<text text-anchor="middle" x="998.18" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge23" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M998.18,-1367.7C998.18,-1359.98 998.18,-1350.71 998.18,-1342.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1342.1 998.18,-1332.1 994.68,-1342.1 1001.68,-1342.1"/>
+</g>
+<!-- 35 -->
+<g id="node28" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="1112.68,-1260 883.68,-1260 883.68,-1224 1112.68,-1224 1112.68,-1260"/>
+<text text-anchor="middle" x="998.18" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge24" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M998.18,-1295.7C998.18,-1287.98 998.18,-1278.71 998.18,-1270.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1270.1 998.18,-1260.1 994.68,-1270.1 1001.68,-1270.1"/>
+</g>
+<!-- 36 -->
+<g id="node29" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="1169.68,-1188 826.68,-1188 826.68,-1152 1169.68,-1152 1169.68,-1188"/>
+<text text-anchor="middle" x="998.18" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge25" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M998.18,-1223.7C998.18,-1215.98 998.18,-1206.71 998.18,-1198.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1198.1 998.18,-1188.1 994.68,-1198.1 1001.68,-1198.1"/>
+</g>
+<!-- 37 -->
+<g id="node30" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1103.68,-1116 892.68,-1116 892.68,-1080 1103.68,-1080 1103.68,-1116"/>
+<text text-anchor="middle" x="998.18" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 36&#45;&gt;37 -->
+<g id="edge26" class="edge">
+<title>36&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M998.18,-1151.7C998.18,-1143.98 998.18,-1134.71 998.18,-1126.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1126.1 998.18,-1116.1 994.68,-1126.1 1001.68,-1126.1"/>
+</g>
+<!-- 37&#45;&gt;38 -->
+<g id="edge28" class="edge">
+<title>37&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M998.18,-1079.7C998.18,-1071.98 998.18,-1062.71 998.18,-1054.11"/>
+<polygon fill="black" stroke="black" points="1001.68,-1054.1 998.18,-1044.1 994.68,-1054.1 1001.68,-1054.1"/>
+</g>
+<!-- 39 -->
+<g id="node32" class="node">
+<title>39</title>
+<polygon fill="none" stroke="black" points="1216.68,-972 851.68,-972 851.68,-936 1216.68,-936 1216.68,-972"/>
+<text text-anchor="middle" x="1034.18" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 38&#45;&gt;39 -->
+<g id="edge29" class="edge">
+<title>38&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M1007.08,-1007.7C1011.18,-999.73 1016.13,-990.1 1020.67,-981.26"/>
+<polygon fill="black" stroke="black" points="1023.92,-982.6 1025.39,-972.1 1017.7,-979.4 1023.92,-982.6"/>
+</g>
+<!-- 41 -->
+<g id="node33" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1105.68,-900 998.68,-900 998.68,-864 1105.68,-864 1105.68,-900"/>
+<text text-anchor="middle" x="1052.18" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 39&#45;&gt;41 -->
+<g id="edge30" class="edge">
+<title>39&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1038.63,-935.7C1040.64,-927.9 1043.05,-918.51 1045.28,-909.83"/>
+<polygon fill="black" stroke="black" points="1048.68,-910.66 1047.78,-900.1 1041.9,-908.92 1048.68,-910.66"/>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge31" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1106.06,-868.07C1111.84,-866.69 1117.63,-865.31 1123.18,-864 1185.98,-849.15 1258.61,-832.24 1304.46,-821.6"/>
+<polygon fill="black" stroke="black" points="1305.44,-824.96 1314.39,-819.29 1303.86,-818.14 1305.44,-824.96"/>
+</g>
+<!-- 43 -->
+<g id="node35" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1445.18,-756 1271.18,-756 1271.18,-720 1445.18,-720 1445.18,-756"/>
+<text text-anchor="middle" x="1358.18" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge33" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1352.16,-791.7C1353.04,-783.98 1354.1,-774.71 1355.08,-766.11"/>
+<polygon fill="black" stroke="black" points="1358.57,-766.44 1356.23,-756.1 1351.61,-765.64 1358.57,-766.44"/>
+</g>
+<!-- 44 -->
+<g id="node36" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1472.68,-684 1289.68,-684 1289.68,-648 1472.68,-648 1472.68,-684"/>
+<text text-anchor="middle" x="1381.18" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 43&#45;&gt;44 -->
+<g id="edge34" class="edge">
+<title>43&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M1363.87,-719.7C1366.43,-711.9 1369.52,-702.51 1372.37,-693.83"/>
+<polygon fill="black" stroke="black" points="1375.77,-694.7 1375.56,-684.1 1369.12,-692.51 1375.77,-694.7"/>
+</g>
+<!-- 45 -->
+<g id="node37" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="1477.18,-612 1309.18,-612 1309.18,-576 1477.18,-576 1477.18,-612"/>
+<text text-anchor="middle" x="1393.18" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 44&#45;&gt;45 -->
+<g id="edge35" class="edge">
+<title>44&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M1384.15,-647.7C1385.47,-639.98 1387.06,-630.71 1388.53,-622.11"/>
+<polygon fill="black" stroke="black" points="1392.01,-622.55 1390.25,-612.1 1385.11,-621.37 1392.01,-622.55"/>
+</g>
+<!-- 46 -->
+<g id="node38" class="node">
+<title>46</title>
+<polygon fill="none" stroke="black" points="1570.68,-540 1227.68,-540 1227.68,-504 1570.68,-504 1570.68,-540"/>
+<text text-anchor="middle" x="1399.18" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 45&#45;&gt;46 -->
+<g id="edge36" class="edge">
+<title>45&#45;&gt;46</title>
+<path fill="none" stroke="black" d="M1394.67,-575.7C1395.33,-567.98 1396.12,-558.71 1396.86,-550.11"/>
+<polygon fill="black" stroke="black" points="1400.35,-550.37 1397.72,-540.1 1393.37,-549.77 1400.35,-550.37"/>
+</g>
+<!-- 58 -->
+<g id="node50" class="node">
+<title>58</title>
+<polygon fill="none" stroke="black" points="1582.68,-468 1413.68,-468 1413.68,-432 1582.68,-432 1582.68,-468"/>
+<text text-anchor="middle" x="1498.18" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 46&#45;&gt;58 -->
+<g id="edge50" class="edge">
+<title>46&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1423.4,-503.88C1436.1,-494.89 1451.85,-483.76 1465.61,-474.03"/>
+<polygon fill="black" stroke="black" points="1467.73,-476.82 1473.87,-468.19 1463.69,-471.11 1467.73,-476.82"/>
+</g>
+<!-- 51 -->
+<g id="node43" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="1692.68,-1620 1523.68,-1620 1523.68,-1584 1692.68,-1584 1692.68,-1620"/>
+<text text-anchor="middle" x="1608.18" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 47&#45;&gt;51 -->
+<g id="edge41" class="edge">
+<title>47&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1608.18,-1655.7C1608.18,-1647.98 1608.18,-1638.71 1608.18,-1630.11"/>
+<polygon fill="black" stroke="black" points="1611.68,-1630.1 1608.18,-1620.1 1604.68,-1630.1 1611.68,-1630.1"/>
+</g>
+<!-- 49 -->
+<g id="node41" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="2088.68,-1764 1727.68,-1764 1727.68,-1728 2088.68,-1728 2088.68,-1764"/>
+<text text-anchor="middle" x="1908.18" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge39" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1908.18,-1799.7C1908.18,-1791.98 1908.18,-1782.71 1908.18,-1774.11"/>
+<polygon fill="black" stroke="black" points="1911.68,-1774.1 1908.18,-1764.1 1904.68,-1774.1 1911.68,-1774.1"/>
+</g>
+<!-- 50 -->
+<g id="node42" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="2013.68,-1692 1802.68,-1692 1802.68,-1656 2013.68,-1656 2013.68,-1692"/>
+<text text-anchor="middle" x="1908.18" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 49&#45;&gt;50 -->
+<g id="edge40" class="edge">
+<title>49&#45;&gt;50</title>
+<path fill="none" stroke="black" d="M1908.18,-1727.7C1908.18,-1719.98 1908.18,-1710.71 1908.18,-1702.11"/>
+<polygon fill="black" stroke="black" points="1911.68,-1702.1 1908.18,-1692.1 1904.68,-1702.1 1911.68,-1702.1"/>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge42" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1835.18,-1655.97C1791.63,-1645.8 1736.21,-1632.87 1691.03,-1622.33"/>
+<polygon fill="black" stroke="black" points="1691.79,-1618.91 1681.25,-1620.05 1690.2,-1625.73 1691.79,-1618.91"/>
+</g>
+<!-- 52 -->
+<g id="node44" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="1779.68,-1548 1432.68,-1548 1432.68,-1512 1779.68,-1512 1779.68,-1548"/>
+<text text-anchor="middle" x="1606.18" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge43" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M1607.69,-1583.7C1607.47,-1575.98 1607.2,-1566.71 1606.96,-1558.11"/>
+<polygon fill="black" stroke="black" points="1610.46,-1558 1606.67,-1548.1 1603.46,-1558.2 1610.46,-1558"/>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge44" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1606.18,-1511.7C1606.18,-1503.98 1606.18,-1494.71 1606.18,-1486.11"/>
+<polygon fill="black" stroke="black" points="1609.68,-1486.1 1606.18,-1476.1 1602.68,-1486.1 1609.68,-1486.1"/>
+</g>
+<!-- 54 -->
+<g id="node46" class="node">
+<title>54</title>
+<polygon fill="none" stroke="black" points="1787.68,-1404 1422.68,-1404 1422.68,-1368 1787.68,-1368 1787.68,-1404"/>
+<text text-anchor="middle" x="1605.18" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 53&#45;&gt;54 -->
+<g id="edge46" class="edge">
+<title>53&#45;&gt;54</title>
+<path fill="none" stroke="black" d="M1605.94,-1439.7C1605.82,-1431.98 1605.69,-1422.71 1605.57,-1414.11"/>
+<polygon fill="black" stroke="black" points="1609.07,-1414.05 1605.43,-1404.1 1602.07,-1414.15 1609.07,-1414.05"/>
+</g>
+<!-- 55 -->
+<g id="node47" class="node">
+<title>55</title>
+<polygon fill="none" stroke="black" points="1719.68,-1332 1490.68,-1332 1490.68,-1296 1719.68,-1296 1719.68,-1332"/>
+<text text-anchor="middle" x="1605.18" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 54&#45;&gt;55 -->
+<g id="edge47" class="edge">
+<title>54&#45;&gt;55</title>
+<path fill="none" stroke="black" d="M1605.18,-1367.7C1605.18,-1359.98 1605.18,-1350.71 1605.18,-1342.11"/>
+<polygon fill="black" stroke="black" points="1608.68,-1342.1 1605.18,-1332.1 1601.68,-1342.1 1608.68,-1342.1"/>
+</g>
+<!-- 56 -->
+<g id="node48" class="node">
+<title>56</title>
+<polygon fill="none" stroke="black" points="1775.68,-1260 1432.68,-1260 1432.68,-1224 1775.68,-1224 1775.68,-1260"/>
+<text text-anchor="middle" x="1604.18" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 55&#45;&gt;56 -->
+<g id="edge48" class="edge">
+<title>55&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1604.94,-1295.7C1604.82,-1287.98 1604.69,-1278.71 1604.57,-1270.11"/>
+<polygon fill="black" stroke="black" points="1608.07,-1270.05 1604.43,-1260.1 1601.07,-1270.15 1608.07,-1270.05"/>
+</g>
+<!-- 57 -->
+<g id="node49" class="node">
+<title>57</title>
+<polygon fill="none" stroke="black" points="1706.68,-1044 1495.68,-1044 1495.68,-1008 1706.68,-1008 1706.68,-1044"/>
+<text text-anchor="middle" x="1601.18" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 56&#45;&gt;57 -->
+<g id="edge49" class="edge">
+<title>56&#45;&gt;57</title>
+<path fill="none" stroke="black" d="M1603.94,-1223.85C1603.42,-1186.83 1602.19,-1099.18 1601.57,-1054.39"/>
+<polygon fill="black" stroke="black" points="1605.06,-1054.18 1601.42,-1044.23 1598.06,-1054.28 1605.06,-1054.18"/>
+</g>
+<!-- 57&#45;&gt;58 -->
+<g id="edge51" class="edge">
+<title>57&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1601.18,-1007.95C1601.18,-981.29 1601.18,-928.11 1601.18,-883 1601.18,-883 1601.18,-883 1601.18,-593 1601.18,-552.36 1603.61,-537.21 1580.18,-504 1571.45,-491.62 1558.82,-481.42 1546.08,-473.4"/>
+<polygon fill="black" stroke="black" points="1547.67,-470.27 1537.29,-468.19 1544.1,-476.3 1547.67,-470.27"/>
+</g>
+<!-- 59 -->
+<g id="node51" class="node">
+<title>59</title>
+<polygon fill="none" stroke="black" points="1680.68,-396 1315.68,-396 1315.68,-360 1680.68,-360 1680.68,-396"/>
+<text text-anchor="middle" x="1498.18" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 58&#45;&gt;59 -->
+<g id="edge52" class="edge">
+<title>58&#45;&gt;59</title>
+<path fill="none" stroke="black" d="M1498.18,-431.7C1498.18,-423.98 1498.18,-414.71 1498.18,-406.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-406.1 1498.18,-396.1 1494.68,-406.1 1501.68,-406.1"/>
+</g>
+<!-- 60 -->
+<g id="node52" class="node">
+<title>60</title>
+<polygon fill="none" stroke="black" points="1612.68,-324 1383.68,-324 1383.68,-288 1612.68,-288 1612.68,-324"/>
+<text text-anchor="middle" x="1498.18" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 59&#45;&gt;60 -->
+<g id="edge53" class="edge">
+<title>59&#45;&gt;60</title>
+<path fill="none" stroke="black" d="M1498.18,-359.7C1498.18,-351.98 1498.18,-342.71 1498.18,-334.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-334.1 1498.18,-324.1 1494.68,-334.1 1501.68,-334.1"/>
+</g>
+<!-- 61 -->
+<g id="node53" class="node">
+<title>61</title>
+<polygon fill="none" stroke="black" points="1530.68,-252 1465.68,-252 1465.68,-216 1530.68,-216 1530.68,-252"/>
+<text text-anchor="middle" x="1498.18" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 60&#45;&gt;61 -->
+<g id="edge54" class="edge">
+<title>60&#45;&gt;61</title>
+<path fill="none" stroke="black" d="M1498.18,-287.7C1498.18,-279.98 1498.18,-270.71 1498.18,-262.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-262.1 1498.18,-252.1 1494.68,-262.1 1501.68,-262.1"/>
+</g>
+<!-- 62 -->
+<g id="node54" class="node">
+<title>62</title>
+<polygon fill="none" stroke="black" points="1671.68,-180 1324.68,-180 1324.68,-144 1671.68,-144 1671.68,-180"/>
+<text text-anchor="middle" x="1498.18" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 61&#45;&gt;62 -->
+<g id="edge55" class="edge">
+<title>61&#45;&gt;62</title>
+<path fill="none" stroke="black" d="M1498.18,-215.7C1498.18,-207.98 1498.18,-198.71 1498.18,-190.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-190.1 1498.18,-180.1 1494.68,-190.1 1501.68,-190.1"/>
+</g>
+<!-- 63 -->
+<g id="node55" class="node">
+<title>63</title>
+<polygon fill="none" stroke="black" points="1541.18,-108 1455.18,-108 1455.18,-72 1541.18,-72 1541.18,-108"/>
+<text text-anchor="middle" x="1498.18" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 62&#45;&gt;63 -->
+<g id="edge56" class="edge">
+<title>62&#45;&gt;63</title>
+<path fill="none" stroke="black" d="M1498.18,-143.7C1498.18,-135.98 1498.18,-126.71 1498.18,-118.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-118.1 1498.18,-108.1 1494.68,-118.1 1501.68,-118.1"/>
+</g>
+<!-- 64 -->
+<g id="node56" class="node">
+<title>64</title>
+<polygon fill="none" stroke="black" points="1538.18,-36 1458.18,-36 1458.18,0 1538.18,0 1538.18,-36"/>
+<text text-anchor="middle" x="1498.18" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 63&#45;&gt;64 -->
+<g id="edge57" class="edge">
+<title>63&#45;&gt;64</title>
+<path fill="none" stroke="black" d="M1498.18,-71.7C1498.18,-63.98 1498.18,-54.71 1498.18,-46.11"/>
+<polygon fill="black" stroke="black" points="1501.68,-46.1 1498.18,-36.1 1494.68,-46.1 1501.68,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert-tvm_65_2.svg b/images/bert-pytorch/bert-tvm_65_2.svg
new file mode 100644
index 0000000..4b26fbd
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_65_2.svg
@@ -0,0 +1,667 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="2122pt" height="1916pt"
+ viewBox="0.00 0.00 2122.14 1916.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1912)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1912 2118.14,-1912 2118.14,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="1461.64" cy="-1746" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="1461.64" y="-1742.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 16 -->
+<g id="node9" class="node">
+<title>16</title>
+<polygon fill="none" stroke="black" points="1620.64,-1692 1268.64,-1692 1268.64,-1656 1620.64,-1656 1620.64,-1692"/>
+<text text-anchor="middle" x="1444.64" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;16 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;16</title>
+<path fill="none" stroke="black" d="M1457.43,-1727.7C1455.54,-1719.9 1453.26,-1710.51 1451.15,-1701.83"/>
+<polygon fill="black" stroke="black" points="1454.55,-1701 1448.79,-1692.1 1447.75,-1702.65 1454.55,-1701"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="1092.64" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="1092.64" y="-1886.3" font-family="Times,serif" font-size="14.00">query.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 17 -->
+<g id="node10" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="1189.14,-1836 996.14,-1836 996.14,-1800 1189.14,-1800 1189.14,-1836"/>
+<text text-anchor="middle" x="1092.64" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 1&#45;&gt;17 -->
+<g id="edge2" class="edge">
+<title>1&#45;&gt;17</title>
+<path fill="none" stroke="black" d="M1092.64,-1871.7C1092.64,-1863.98 1092.64,-1854.71 1092.64,-1846.11"/>
+<polygon fill="black" stroke="black" points="1096.14,-1846.1 1092.64,-1836.1 1089.14,-1846.1 1096.14,-1846.1"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="black" cx="863.64" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="863.64" y="-1526.3" font-family="Times,serif" font-size="14.00">query.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 22 -->
+<g id="node15" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="1002.14,-1476 931.14,-1476 931.14,-1440 1002.14,-1440 1002.14,-1476"/>
+<text text-anchor="middle" x="966.64" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 2&#45;&gt;22 -->
+<g id="edge9" class="edge">
+<title>2&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M888.57,-1512.05C901.88,-1503.01 918.46,-1491.74 932.9,-1481.93"/>
+<polygon fill="black" stroke="black" points="935.27,-1484.55 941.58,-1476.03 931.34,-1478.76 935.27,-1484.55"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="black" cx="564.64" cy="-1890" rx="189.57" ry="18"/>
+<text text-anchor="middle" x="564.64" y="-1886.3" font-family="Times,serif" font-size="14.00">key.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 26 -->
+<g id="node19" class="node">
+<title>26</title>
+<polygon fill="none" stroke="black" points="661.14,-1836 468.14,-1836 468.14,-1800 661.14,-1800 661.14,-1836"/>
+<text text-anchor="middle" x="564.64" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 3&#45;&gt;26 -->
+<g id="edge13" class="edge">
+<title>3&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M564.64,-1871.7C564.64,-1863.98 564.64,-1854.71 564.64,-1846.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1846.1 564.64,-1836.1 561.14,-1846.1 568.14,-1846.1"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="black" cx="156.64" cy="-1530" rx="156.77" ry="18"/>
+<text text-anchor="middle" x="156.64" y="-1526.3" font-family="Times,serif" font-size="14.00">key.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 31 -->
+<g id="node24" class="node">
+<title>31</title>
+<polygon fill="none" stroke="black" points="540.14,-1476 469.14,-1476 469.14,-1440 540.14,-1440 540.14,-1476"/>
+<text text-anchor="middle" x="504.64" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 4&#45;&gt;31 -->
+<g id="edge20" class="edge">
+<title>4&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M230.86,-1514.07C299.97,-1500.17 400.8,-1479.89 458.85,-1468.21"/>
+<polygon fill="black" stroke="black" points="459.71,-1471.61 468.83,-1466.2 458.33,-1464.74 459.71,-1471.61"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="black" cx="1325.64" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1325.64" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 41 -->
+<g id="node33" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1361.14,-828 1290.14,-828 1290.14,-792 1361.14,-792 1361.14,-828"/>
+<text text-anchor="middle" x="1325.64" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 5&#45;&gt;41 -->
+<g id="edge31" class="edge">
+<title>5&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1325.64,-863.7C1325.64,-855.98 1325.64,-846.71 1325.64,-838.11"/>
+<polygon fill="black" stroke="black" points="1329.14,-838.1 1325.64,-828.1 1322.14,-838.1 1329.14,-838.1"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="black" cx="1885.64" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="1885.64" y="-1886.3" font-family="Times,serif" font-size="14.00">value.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 46 -->
+<g id="node38" class="node">
+<title>46</title>
+<polygon fill="none" stroke="black" points="1982.14,-1836 1789.14,-1836 1789.14,-1800 1982.14,-1800 1982.14,-1836"/>
+<text text-anchor="middle" x="1885.64" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 6&#45;&gt;46 -->
+<g id="edge36" class="edge">
+<title>6&#45;&gt;46</title>
+<path fill="none" stroke="black" d="M1885.64,-1871.7C1885.64,-1863.98 1885.64,-1854.71 1885.64,-1846.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1846.1 1885.64,-1836.1 1882.14,-1846.1 1889.14,-1846.1"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="black" cx="1581.64" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="1581.64" y="-1526.3" font-family="Times,serif" font-size="14.00">value.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 51 -->
+<g id="node43" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="1617.14,-1476 1546.14,-1476 1546.14,-1440 1617.14,-1440 1617.14,-1476"/>
+<text text-anchor="middle" x="1581.64" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 7&#45;&gt;51 -->
+<g id="edge43" class="edge">
+<title>7&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1581.64,-1511.7C1581.64,-1503.98 1581.64,-1494.71 1581.64,-1486.11"/>
+<polygon fill="black" stroke="black" points="1585.14,-1486.1 1581.64,-1476.1 1578.14,-1486.1 1585.14,-1486.1"/>
+</g>
+<!-- 20 -->
+<g id="node13" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="1307.14,-1620 1138.14,-1620 1138.14,-1584 1307.14,-1584 1307.14,-1620"/>
+<text text-anchor="middle" x="1222.64" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;20 -->
+<g id="edge5" class="edge">
+<title>16&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M1390.61,-1655.97C1359.19,-1646.06 1319.41,-1633.51 1286.45,-1623.12"/>
+<polygon fill="black" stroke="black" points="1287.49,-1619.78 1276.9,-1620.11 1285.39,-1626.46 1287.49,-1619.78"/>
+</g>
+<!-- 29 -->
+<g id="node22" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="649.14,-1620 480.14,-1620 480.14,-1584 649.14,-1584 649.14,-1620"/>
+<text text-anchor="middle" x="564.64" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;29 -->
+<g id="edge16" class="edge">
+<title>16&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M1268.41,-1658.98C1085.06,-1644.4 804.87,-1622.11 659.51,-1610.55"/>
+<polygon fill="black" stroke="black" points="659.53,-1607.04 649.28,-1609.73 658.97,-1614.02 659.53,-1607.04"/>
+</g>
+<!-- 49 -->
+<g id="node41" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="1970.14,-1620 1801.14,-1620 1801.14,-1584 1970.14,-1584 1970.14,-1620"/>
+<text text-anchor="middle" x="1885.64" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;49 -->
+<g id="edge39" class="edge">
+<title>16&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1551.95,-1655.97C1624.98,-1644.37 1720.72,-1629.18 1790.91,-1618.04"/>
+<polygon fill="black" stroke="black" points="1791.75,-1621.45 1801.08,-1616.42 1790.65,-1614.53 1791.75,-1621.45"/>
+</g>
+<!-- 18 -->
+<g id="node11" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="1273.14,-1764 912.14,-1764 912.14,-1728 1273.14,-1728 1273.14,-1764"/>
+<text text-anchor="middle" x="1092.64" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge3" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M1092.64,-1799.7C1092.64,-1791.98 1092.64,-1782.71 1092.64,-1774.11"/>
+<polygon fill="black" stroke="black" points="1096.14,-1774.1 1092.64,-1764.1 1089.14,-1774.1 1096.14,-1774.1"/>
+</g>
+<!-- 19 -->
+<g id="node12" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="1216.14,-1692 1005.14,-1692 1005.14,-1656 1216.14,-1656 1216.14,-1692"/>
+<text text-anchor="middle" x="1110.64" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 18&#45;&gt;19 -->
+<g id="edge4" class="edge">
+<title>18&#45;&gt;19</title>
+<path fill="none" stroke="black" d="M1097.09,-1727.7C1099.09,-1719.9 1101.51,-1710.51 1103.74,-1701.83"/>
+<polygon fill="black" stroke="black" points="1107.14,-1702.66 1106.24,-1692.1 1100.36,-1700.92 1107.14,-1702.66"/>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge6" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M1138.03,-1655.88C1152.54,-1646.81 1170.55,-1635.55 1186.22,-1625.76"/>
+<polygon fill="black" stroke="black" points="1188.51,-1628.46 1195.13,-1620.19 1184.8,-1622.52 1188.51,-1628.46"/>
+</g>
+<!-- 21 -->
+<g id="node14" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="1396.14,-1548 1049.14,-1548 1049.14,-1512 1396.14,-1512 1396.14,-1548"/>
+<text text-anchor="middle" x="1222.64" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge7" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M1222.64,-1583.7C1222.64,-1575.98 1222.64,-1566.71 1222.64,-1558.11"/>
+<polygon fill="black" stroke="black" points="1226.14,-1558.1 1222.64,-1548.1 1219.14,-1558.1 1226.14,-1558.1"/>
+</g>
+<!-- 21&#45;&gt;22 -->
+<g id="edge8" class="edge">
+<title>21&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M1160.34,-1511.97C1114.53,-1499.44 1053.33,-1482.71 1012.08,-1471.43"/>
+<polygon fill="black" stroke="black" points="1012.87,-1468.01 1002.3,-1468.75 1011.03,-1474.77 1012.87,-1468.01"/>
+</g>
+<!-- 23 -->
+<g id="node16" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="1149.14,-1404 784.14,-1404 784.14,-1368 1149.14,-1368 1149.14,-1404"/>
+<text text-anchor="middle" x="966.64" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge10" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M966.64,-1439.7C966.64,-1431.98 966.64,-1422.71 966.64,-1414.11"/>
+<polygon fill="black" stroke="black" points="970.14,-1414.1 966.64,-1404.1 963.14,-1414.1 970.14,-1414.1"/>
+</g>
+<!-- 24 -->
+<g id="node17" class="node">
+<title>24</title>
+<polygon fill="none" stroke="black" points="1081.14,-1332 852.14,-1332 852.14,-1296 1081.14,-1296 1081.14,-1332"/>
+<text text-anchor="middle" x="966.64" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 23&#45;&gt;24 -->
+<g id="edge11" class="edge">
+<title>23&#45;&gt;24</title>
+<path fill="none" stroke="black" d="M966.64,-1367.7C966.64,-1359.98 966.64,-1350.71 966.64,-1342.11"/>
+<polygon fill="black" stroke="black" points="970.14,-1342.1 966.64,-1332.1 963.14,-1342.1 970.14,-1342.1"/>
+</g>
+<!-- 25 -->
+<g id="node18" class="node">
+<title>25</title>
+<polygon fill="none" stroke="black" points="1138.14,-1116 795.14,-1116 795.14,-1080 1138.14,-1080 1138.14,-1116"/>
+<text text-anchor="middle" x="966.64" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 24&#45;&gt;25 -->
+<g id="edge12" class="edge">
+<title>24&#45;&gt;25</title>
+<path fill="none" stroke="black" d="M966.64,-1295.85C966.64,-1258.83 966.64,-1171.18 966.64,-1126.39"/>
+<polygon fill="black" stroke="black" points="970.14,-1126.23 966.64,-1116.23 963.14,-1126.23 970.14,-1126.23"/>
+</g>
+<!-- 37 -->
+<g id="node30" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1051.14,-1044 882.14,-1044 882.14,-1008 1051.14,-1008 1051.14,-1044"/>
+<text text-anchor="middle" x="966.64" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 25&#45;&gt;37 -->
+<g id="edge26" class="edge">
+<title>25&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M966.64,-1079.7C966.64,-1071.98 966.64,-1062.71 966.64,-1054.11"/>
+<polygon fill="black" stroke="black" points="970.14,-1054.1 966.64,-1044.1 963.14,-1054.1 970.14,-1054.1"/>
+</g>
+<!-- 27 -->
+<g id="node20" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="745.14,-1764 384.14,-1764 384.14,-1728 745.14,-1728 745.14,-1764"/>
+<text text-anchor="middle" x="564.64" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 26&#45;&gt;27 -->
+<g id="edge14" class="edge">
+<title>26&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M564.64,-1799.7C564.64,-1791.98 564.64,-1782.71 564.64,-1774.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1774.1 564.64,-1764.1 561.14,-1774.1 568.14,-1774.1"/>
+</g>
+<!-- 28 -->
+<g id="node21" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="670.14,-1692 459.14,-1692 459.14,-1656 670.14,-1656 670.14,-1692"/>
+<text text-anchor="middle" x="564.64" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 27&#45;&gt;28 -->
+<g id="edge15" class="edge">
+<title>27&#45;&gt;28</title>
+<path fill="none" stroke="black" d="M564.64,-1727.7C564.64,-1719.98 564.64,-1710.71 564.64,-1702.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1702.1 564.64,-1692.1 561.14,-1702.1 568.14,-1702.1"/>
+</g>
+<!-- 28&#45;&gt;29 -->
+<g id="edge17" class="edge">
+<title>28&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M564.64,-1655.7C564.64,-1647.98 564.64,-1638.71 564.64,-1630.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1630.1 564.64,-1620.1 561.14,-1630.1 568.14,-1630.1"/>
+</g>
+<!-- 30 -->
+<g id="node23" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="678.14,-1548 331.14,-1548 331.14,-1512 678.14,-1512 678.14,-1548"/>
+<text text-anchor="middle" x="504.64" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge18" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M549.8,-1583.7C542.61,-1575.3 533.84,-1565.07 525.95,-1555.86"/>
+<polygon fill="black" stroke="black" points="528.46,-1553.42 519.3,-1548.1 523.15,-1557.97 528.46,-1553.42"/>
+</g>
+<!-- 30&#45;&gt;31 -->
+<g id="edge19" class="edge">
+<title>30&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M504.64,-1511.7C504.64,-1503.98 504.64,-1494.71 504.64,-1486.11"/>
+<polygon fill="black" stroke="black" points="508.14,-1486.1 504.64,-1476.1 501.14,-1486.1 508.14,-1486.1"/>
+</g>
+<!-- 32 -->
+<g id="node25" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="766.14,-1404 401.14,-1404 401.14,-1368 766.14,-1368 766.14,-1404"/>
+<text text-anchor="middle" x="583.64" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 31&#45;&gt;32 -->
+<g id="edge21" class="edge">
+<title>31&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M524.16,-1439.7C534.02,-1430.97 546.12,-1420.24 556.83,-1410.75"/>
+<polygon fill="black" stroke="black" points="559.17,-1413.36 564.33,-1404.1 554.53,-1408.12 559.17,-1413.36"/>
+</g>
+<!-- 33 -->
+<g id="node26" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="704.14,-1332 475.14,-1332 475.14,-1296 704.14,-1296 704.14,-1332"/>
+<text text-anchor="middle" x="589.64" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge22" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M585.12,-1367.7C585.78,-1359.98 586.58,-1350.71 587.31,-1342.11"/>
+<polygon fill="black" stroke="black" points="590.8,-1342.37 588.17,-1332.1 583.83,-1341.77 590.8,-1342.37"/>
+</g>
+<!-- 34 -->
+<g id="node27" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="748.14,-1260 519.14,-1260 519.14,-1224 748.14,-1224 748.14,-1260"/>
+<text text-anchor="middle" x="633.64" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge23" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M600.51,-1295.7C605.63,-1287.56 611.83,-1277.69 617.48,-1268.7"/>
+<polygon fill="black" stroke="black" points="620.53,-1270.43 622.88,-1260.1 614.6,-1266.71 620.53,-1270.43"/>
+</g>
+<!-- 35 -->
+<g id="node28" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="827.14,-1188 484.14,-1188 484.14,-1152 827.14,-1152 827.14,-1188"/>
+<text text-anchor="middle" x="655.64" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge24" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M639.07,-1223.7C641.53,-1215.9 644.48,-1206.51 647.2,-1197.83"/>
+<polygon fill="black" stroke="black" points="650.6,-1198.69 650.26,-1188.1 643.92,-1196.59 650.6,-1198.69"/>
+</g>
+<!-- 36 -->
+<g id="node29" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="772.14,-1116 561.14,-1116 561.14,-1080 772.14,-1080 772.14,-1116"/>
+<text text-anchor="middle" x="666.64" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge25" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M658.36,-1151.7C659.57,-1143.98 661.02,-1134.71 662.38,-1126.11"/>
+<polygon fill="black" stroke="black" points="665.85,-1126.53 663.95,-1116.1 658.94,-1125.44 665.85,-1126.53"/>
+</g>
+<!-- 36&#45;&gt;37 -->
+<g id="edge27" class="edge">
+<title>36&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M739.64,-1079.97C783.19,-1069.8 838.61,-1056.87 883.79,-1046.33"/>
+<polygon fill="black" stroke="black" points="884.62,-1049.73 893.56,-1044.05 883.03,-1042.91 884.62,-1049.73"/>
+</g>
+<!-- 38 -->
+<g id="node31" class="node">
+<title>38</title>
+<polygon fill="none" stroke="black" points="1189.14,-972 824.14,-972 824.14,-936 1189.14,-936 1189.14,-972"/>
+<text text-anchor="middle" x="1006.64" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 37&#45;&gt;38 -->
+<g id="edge28" class="edge">
+<title>37&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M976.52,-1007.7C981.13,-999.64 986.7,-989.89 991.79,-980.98"/>
+<polygon fill="black" stroke="black" points="994.94,-982.52 996.86,-972.1 988.86,-979.05 994.94,-982.52"/>
+</g>
+<!-- 40 -->
+<g id="node32" class="node">
+<title>40</title>
+<polygon fill="none" stroke="black" points="1080.14,-900 973.14,-900 973.14,-864 1080.14,-864 1080.14,-900"/>
+<text text-anchor="middle" x="1026.64" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 38&#45;&gt;40 -->
+<g id="edge29" class="edge">
+<title>38&#45;&gt;40</title>
+<path fill="none" stroke="black" d="M1011.58,-935.7C1013.81,-927.9 1016.49,-918.51 1018.97,-909.83"/>
+<polygon fill="black" stroke="black" points="1022.37,-910.68 1021.75,-900.1 1015.64,-908.76 1022.37,-910.68"/>
+</g>
+<!-- 40&#45;&gt;41 -->
+<g id="edge30" class="edge">
+<title>40&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1080.5,-868.27C1086.61,-866.83 1092.76,-865.38 1098.64,-864 1161.45,-849.25 1234.08,-832.31 1279.92,-821.64"/>
+<polygon fill="black" stroke="black" points="1280.91,-825 1289.85,-819.33 1279.32,-818.18 1280.91,-825"/>
+</g>
+<!-- 42 -->
+<g id="node34" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1420.64,-756 1246.64,-756 1246.64,-720 1420.64,-720 1420.64,-756"/>
+<text text-anchor="middle" x="1333.64" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge32" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1327.61,-791.7C1328.5,-783.98 1329.55,-774.71 1330.54,-766.11"/>
+<polygon fill="black" stroke="black" points="1334.02,-766.44 1331.68,-756.1 1327.07,-765.64 1334.02,-766.44"/>
+</g>
+<!-- 43 -->
+<g id="node35" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1448.14,-684 1265.14,-684 1265.14,-648 1448.14,-648 1448.14,-684"/>
+<text text-anchor="middle" x="1356.64" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge33" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1339.32,-719.7C1341.88,-711.9 1344.97,-702.51 1347.82,-693.83"/>
+<polygon fill="black" stroke="black" points="1351.22,-694.7 1351.02,-684.1 1344.57,-692.51 1351.22,-694.7"/>
+</g>
+<!-- 44 -->
+<g id="node36" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1452.64,-612 1284.64,-612 1284.64,-576 1452.64,-576 1452.64,-612"/>
+<text text-anchor="middle" x="1368.64" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 43&#45;&gt;44 -->
+<g id="edge34" class="edge">
+<title>43&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M1359.6,-647.7C1360.92,-639.98 1362.51,-630.71 1363.99,-622.11"/>
+<polygon fill="black" stroke="black" points="1367.46,-622.55 1365.7,-612.1 1360.56,-621.37 1367.46,-622.55"/>
+</g>
+<!-- 45 -->
+<g id="node37" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="1546.14,-540 1203.14,-540 1203.14,-504 1546.14,-504 1546.14,-540"/>
+<text text-anchor="middle" x="1374.64" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 44&#45;&gt;45 -->
+<g id="edge35" class="edge">
+<title>44&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M1370.12,-575.7C1370.78,-567.98 1371.58,-558.71 1372.31,-550.11"/>
+<polygon fill="black" stroke="black" points="1375.8,-550.37 1373.17,-540.1 1368.83,-549.77 1375.8,-550.37"/>
+</g>
+<!-- 56 -->
+<g id="node48" class="node">
+<title>56</title>
+<polygon fill="none" stroke="black" points="1558.14,-468 1389.14,-468 1389.14,-432 1558.14,-432 1558.14,-468"/>
+<text text-anchor="middle" x="1473.64" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 45&#45;&gt;56 -->
+<g id="edge48" class="edge">
+<title>45&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1398.85,-503.88C1411.56,-494.89 1427.3,-483.76 1441.06,-474.03"/>
+<polygon fill="black" stroke="black" points="1443.18,-476.82 1449.32,-468.19 1439.14,-471.11 1443.18,-476.82"/>
+</g>
+<!-- 47 -->
+<g id="node39" class="node">
+<title>47</title>
+<polygon fill="none" stroke="black" points="2066.14,-1764 1705.14,-1764 1705.14,-1728 2066.14,-1728 2066.14,-1764"/>
+<text text-anchor="middle" x="1885.64" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 46&#45;&gt;47 -->
+<g id="edge37" class="edge">
+<title>46&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1885.64,-1799.7C1885.64,-1791.98 1885.64,-1782.71 1885.64,-1774.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1774.1 1885.64,-1764.1 1882.14,-1774.1 1889.14,-1774.1"/>
+</g>
+<!-- 48 -->
+<g id="node40" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="1991.14,-1692 1780.14,-1692 1780.14,-1656 1991.14,-1656 1991.14,-1692"/>
+<text text-anchor="middle" x="1885.64" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 47&#45;&gt;48 -->
+<g id="edge38" class="edge">
+<title>47&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1885.64,-1727.7C1885.64,-1719.98 1885.64,-1710.71 1885.64,-1702.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1702.1 1885.64,-1692.1 1882.14,-1702.1 1889.14,-1702.1"/>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge40" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1885.64,-1655.7C1885.64,-1647.98 1885.64,-1638.71 1885.64,-1630.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1630.1 1885.64,-1620.1 1882.14,-1630.1 1889.14,-1630.1"/>
+</g>
+<!-- 50 -->
+<g id="node42" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="2114.14,-1548 1767.14,-1548 1767.14,-1512 2114.14,-1512 2114.14,-1548"/>
+<text text-anchor="middle" x="1940.64" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 49&#45;&gt;50 -->
+<g id="edge41" class="edge">
+<title>49&#45;&gt;50</title>
+<path fill="none" stroke="black" d="M1899.23,-1583.7C1905.76,-1575.39 1913.7,-1565.28 1920.88,-1556.14"/>
+<polygon fill="black" stroke="black" points="1923.77,-1558.13 1927.2,-1548.1 1918.27,-1553.81 1923.77,-1558.13"/>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge42" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1853.27,-1511.97C1781.96,-1498.06 1684.02,-1478.96 1627.31,-1467.91"/>
+<polygon fill="black" stroke="black" points="1627.7,-1464.42 1617.22,-1465.94 1626.36,-1471.29 1627.7,-1464.42"/>
+</g>
+<!-- 52 -->
+<g id="node44" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="1764.14,-1404 1399.14,-1404 1399.14,-1368 1764.14,-1368 1764.14,-1404"/>
+<text text-anchor="middle" x="1581.64" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge44" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M1581.64,-1439.7C1581.64,-1431.98 1581.64,-1422.71 1581.64,-1414.11"/>
+<polygon fill="black" stroke="black" points="1585.14,-1414.1 1581.64,-1404.1 1578.14,-1414.1 1585.14,-1414.1"/>
+</g>
+<!-- 53 -->
+<g id="node45" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="1695.14,-1332 1466.14,-1332 1466.14,-1296 1695.14,-1296 1695.14,-1332"/>
+<text text-anchor="middle" x="1580.64" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge45" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1581.39,-1367.7C1581.28,-1359.98 1581.15,-1350.71 1581.02,-1342.11"/>
+<polygon fill="black" stroke="black" points="1584.52,-1342.05 1580.88,-1332.1 1577.52,-1342.15 1584.52,-1342.05"/>
+</g>
+<!-- 54 -->
+<g id="node46" class="node">
+<title>54</title>
+<polygon fill="none" stroke="black" points="1751.14,-1260 1408.14,-1260 1408.14,-1224 1751.14,-1224 1751.14,-1260"/>
+<text text-anchor="middle" x="1579.64" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 53&#45;&gt;54 -->
+<g id="edge46" class="edge">
+<title>53&#45;&gt;54</title>
+<path fill="none" stroke="black" d="M1580.39,-1295.7C1580.28,-1287.98 1580.15,-1278.71 1580.02,-1270.11"/>
+<polygon fill="black" stroke="black" points="1583.52,-1270.05 1579.88,-1260.1 1576.52,-1270.15 1583.52,-1270.05"/>
+</g>
+<!-- 55 -->
+<g id="node47" class="node">
+<title>55</title>
+<polygon fill="none" stroke="black" points="1682.14,-1044 1471.14,-1044 1471.14,-1008 1682.14,-1008 1682.14,-1044"/>
+<text text-anchor="middle" x="1576.64" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 54&#45;&gt;55 -->
+<g id="edge47" class="edge">
+<title>54&#45;&gt;55</title>
+<path fill="none" stroke="black" d="M1579.4,-1223.85C1578.88,-1186.83 1577.65,-1099.18 1577.02,-1054.39"/>
+<polygon fill="black" stroke="black" points="1580.52,-1054.18 1576.88,-1044.23 1573.52,-1054.28 1580.52,-1054.18"/>
+</g>
+<!-- 55&#45;&gt;56 -->
+<g id="edge49" class="edge">
+<title>55&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1576.64,-1007.95C1576.64,-981.29 1576.64,-928.11 1576.64,-883 1576.64,-883 1576.64,-883 1576.64,-593 1576.64,-552.36 1579.06,-537.21 1555.64,-504 1546.9,-491.62 1534.27,-481.42 1521.53,-473.4"/>
+<polygon fill="black" stroke="black" points="1523.13,-470.27 1512.74,-468.19 1519.56,-476.3 1523.13,-470.27"/>
+</g>
+<!-- 57 -->
+<g id="node49" class="node">
+<title>57</title>
+<polygon fill="none" stroke="black" points="1656.14,-396 1291.14,-396 1291.14,-360 1656.14,-360 1656.14,-396"/>
+<text text-anchor="middle" x="1473.64" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 56&#45;&gt;57 -->
+<g id="edge50" class="edge">
+<title>56&#45;&gt;57</title>
+<path fill="none" stroke="black" d="M1473.64,-431.7C1473.64,-423.98 1473.64,-414.71 1473.64,-406.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-406.1 1473.64,-396.1 1470.14,-406.1 1477.14,-406.1"/>
+</g>
+<!-- 58 -->
+<g id="node50" class="node">
+<title>58</title>
+<polygon fill="none" stroke="black" points="1588.14,-324 1359.14,-324 1359.14,-288 1588.14,-288 1588.14,-324"/>
+<text text-anchor="middle" x="1473.64" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 57&#45;&gt;58 -->
+<g id="edge51" class="edge">
+<title>57&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1473.64,-359.7C1473.64,-351.98 1473.64,-342.71 1473.64,-334.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-334.1 1473.64,-324.1 1470.14,-334.1 1477.14,-334.1"/>
+</g>
+<!-- 59 -->
+<g id="node51" class="node">
+<title>59</title>
+<polygon fill="none" stroke="black" points="1506.14,-252 1441.14,-252 1441.14,-216 1506.14,-216 1506.14,-252"/>
+<text text-anchor="middle" x="1473.64" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 58&#45;&gt;59 -->
+<g id="edge52" class="edge">
+<title>58&#45;&gt;59</title>
+<path fill="none" stroke="black" d="M1473.64,-287.7C1473.64,-279.98 1473.64,-270.71 1473.64,-262.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-262.1 1473.64,-252.1 1470.14,-262.1 1477.14,-262.1"/>
+</g>
+<!-- 60 -->
+<g id="node52" class="node">
+<title>60</title>
+<polygon fill="none" stroke="black" points="1647.14,-180 1300.14,-180 1300.14,-144 1647.14,-144 1647.14,-180"/>
+<text text-anchor="middle" x="1473.64" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 59&#45;&gt;60 -->
+<g id="edge53" class="edge">
+<title>59&#45;&gt;60</title>
+<path fill="none" stroke="black" d="M1473.64,-215.7C1473.64,-207.98 1473.64,-198.71 1473.64,-190.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-190.1 1473.64,-180.1 1470.14,-190.1 1477.14,-190.1"/>
+</g>
+<!-- 61 -->
+<g id="node53" class="node">
+<title>61</title>
+<polygon fill="none" stroke="black" points="1516.64,-108 1430.64,-108 1430.64,-72 1516.64,-72 1516.64,-108"/>
+<text text-anchor="middle" x="1473.64" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 60&#45;&gt;61 -->
+<g id="edge54" class="edge">
+<title>60&#45;&gt;61</title>
+<path fill="none" stroke="black" d="M1473.64,-143.7C1473.64,-135.98 1473.64,-126.71 1473.64,-118.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-118.1 1473.64,-108.1 1470.14,-118.1 1477.14,-118.1"/>
+</g>
+<!-- 62 -->
+<g id="node54" class="node">
+<title>62</title>
+<polygon fill="none" stroke="black" points="1513.64,-36 1433.64,-36 1433.64,0 1513.64,0 1513.64,-36"/>
+<text text-anchor="middle" x="1473.64" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 61&#45;&gt;62 -->
+<g id="edge55" class="edge">
+<title>61&#45;&gt;62</title>
+<path fill="none" stroke="black" d="M1473.64,-71.7C1473.64,-63.98 1473.64,-54.71 1473.64,-46.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-46.1 1473.64,-36.1 1470.14,-46.1 1477.14,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert-tvm_68_0.svg b/images/bert-pytorch/bert-tvm_68_0.svg
new file mode 100644
index 0000000..4b26fbd
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_68_0.svg
@@ -0,0 +1,667 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="2122pt" height="1916pt"
+ viewBox="0.00 0.00 2122.14 1916.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1912)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1912 2118.14,-1912 2118.14,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="1461.64" cy="-1746" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="1461.64" y="-1742.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 16 -->
+<g id="node9" class="node">
+<title>16</title>
+<polygon fill="none" stroke="black" points="1620.64,-1692 1268.64,-1692 1268.64,-1656 1620.64,-1656 1620.64,-1692"/>
+<text text-anchor="middle" x="1444.64" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;16 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;16</title>
+<path fill="none" stroke="black" d="M1457.43,-1727.7C1455.54,-1719.9 1453.26,-1710.51 1451.15,-1701.83"/>
+<polygon fill="black" stroke="black" points="1454.55,-1701 1448.79,-1692.1 1447.75,-1702.65 1454.55,-1701"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="1092.64" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="1092.64" y="-1886.3" font-family="Times,serif" font-size="14.00">query.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 17 -->
+<g id="node10" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="1189.14,-1836 996.14,-1836 996.14,-1800 1189.14,-1800 1189.14,-1836"/>
+<text text-anchor="middle" x="1092.64" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 1&#45;&gt;17 -->
+<g id="edge2" class="edge">
+<title>1&#45;&gt;17</title>
+<path fill="none" stroke="black" d="M1092.64,-1871.7C1092.64,-1863.98 1092.64,-1854.71 1092.64,-1846.11"/>
+<polygon fill="black" stroke="black" points="1096.14,-1846.1 1092.64,-1836.1 1089.14,-1846.1 1096.14,-1846.1"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="black" cx="863.64" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="863.64" y="-1526.3" font-family="Times,serif" font-size="14.00">query.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 22 -->
+<g id="node15" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="1002.14,-1476 931.14,-1476 931.14,-1440 1002.14,-1440 1002.14,-1476"/>
+<text text-anchor="middle" x="966.64" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 2&#45;&gt;22 -->
+<g id="edge9" class="edge">
+<title>2&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M888.57,-1512.05C901.88,-1503.01 918.46,-1491.74 932.9,-1481.93"/>
+<polygon fill="black" stroke="black" points="935.27,-1484.55 941.58,-1476.03 931.34,-1478.76 935.27,-1484.55"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="black" cx="564.64" cy="-1890" rx="189.57" ry="18"/>
+<text text-anchor="middle" x="564.64" y="-1886.3" font-family="Times,serif" font-size="14.00">key.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 26 -->
+<g id="node19" class="node">
+<title>26</title>
+<polygon fill="none" stroke="black" points="661.14,-1836 468.14,-1836 468.14,-1800 661.14,-1800 661.14,-1836"/>
+<text text-anchor="middle" x="564.64" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 3&#45;&gt;26 -->
+<g id="edge13" class="edge">
+<title>3&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M564.64,-1871.7C564.64,-1863.98 564.64,-1854.71 564.64,-1846.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1846.1 564.64,-1836.1 561.14,-1846.1 568.14,-1846.1"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="black" cx="156.64" cy="-1530" rx="156.77" ry="18"/>
+<text text-anchor="middle" x="156.64" y="-1526.3" font-family="Times,serif" font-size="14.00">key.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 31 -->
+<g id="node24" class="node">
+<title>31</title>
+<polygon fill="none" stroke="black" points="540.14,-1476 469.14,-1476 469.14,-1440 540.14,-1440 540.14,-1476"/>
+<text text-anchor="middle" x="504.64" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 4&#45;&gt;31 -->
+<g id="edge20" class="edge">
+<title>4&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M230.86,-1514.07C299.97,-1500.17 400.8,-1479.89 458.85,-1468.21"/>
+<polygon fill="black" stroke="black" points="459.71,-1471.61 468.83,-1466.2 458.33,-1464.74 459.71,-1471.61"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="black" cx="1325.64" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1325.64" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 41 -->
+<g id="node33" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1361.14,-828 1290.14,-828 1290.14,-792 1361.14,-792 1361.14,-828"/>
+<text text-anchor="middle" x="1325.64" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 5&#45;&gt;41 -->
+<g id="edge31" class="edge">
+<title>5&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1325.64,-863.7C1325.64,-855.98 1325.64,-846.71 1325.64,-838.11"/>
+<polygon fill="black" stroke="black" points="1329.14,-838.1 1325.64,-828.1 1322.14,-838.1 1329.14,-838.1"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="black" cx="1885.64" cy="-1890" rx="200.36" ry="18"/>
+<text text-anchor="middle" x="1885.64" y="-1886.3" font-family="Times,serif" font-size="14.00">value.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 46 -->
+<g id="node38" class="node">
+<title>46</title>
+<polygon fill="none" stroke="black" points="1982.14,-1836 1789.14,-1836 1789.14,-1800 1982.14,-1800 1982.14,-1836"/>
+<text text-anchor="middle" x="1885.64" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 6&#45;&gt;46 -->
+<g id="edge36" class="edge">
+<title>6&#45;&gt;46</title>
+<path fill="none" stroke="black" d="M1885.64,-1871.7C1885.64,-1863.98 1885.64,-1854.71 1885.64,-1846.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1846.1 1885.64,-1836.1 1882.14,-1846.1 1889.14,-1846.1"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="black" cx="1581.64" cy="-1530" rx="167.07" ry="18"/>
+<text text-anchor="middle" x="1581.64" y="-1526.3" font-family="Times,serif" font-size="14.00">value.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 51 -->
+<g id="node43" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="1617.14,-1476 1546.14,-1476 1546.14,-1440 1617.14,-1440 1617.14,-1476"/>
+<text text-anchor="middle" x="1581.64" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 7&#45;&gt;51 -->
+<g id="edge43" class="edge">
+<title>7&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1581.64,-1511.7C1581.64,-1503.98 1581.64,-1494.71 1581.64,-1486.11"/>
+<polygon fill="black" stroke="black" points="1585.14,-1486.1 1581.64,-1476.1 1578.14,-1486.1 1585.14,-1486.1"/>
+</g>
+<!-- 20 -->
+<g id="node13" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="1307.14,-1620 1138.14,-1620 1138.14,-1584 1307.14,-1584 1307.14,-1620"/>
+<text text-anchor="middle" x="1222.64" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;20 -->
+<g id="edge5" class="edge">
+<title>16&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M1390.61,-1655.97C1359.19,-1646.06 1319.41,-1633.51 1286.45,-1623.12"/>
+<polygon fill="black" stroke="black" points="1287.49,-1619.78 1276.9,-1620.11 1285.39,-1626.46 1287.49,-1619.78"/>
+</g>
+<!-- 29 -->
+<g id="node22" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="649.14,-1620 480.14,-1620 480.14,-1584 649.14,-1584 649.14,-1620"/>
+<text text-anchor="middle" x="564.64" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;29 -->
+<g id="edge16" class="edge">
+<title>16&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M1268.41,-1658.98C1085.06,-1644.4 804.87,-1622.11 659.51,-1610.55"/>
+<polygon fill="black" stroke="black" points="659.53,-1607.04 649.28,-1609.73 658.97,-1614.02 659.53,-1607.04"/>
+</g>
+<!-- 49 -->
+<g id="node41" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="1970.14,-1620 1801.14,-1620 1801.14,-1584 1970.14,-1584 1970.14,-1620"/>
+<text text-anchor="middle" x="1885.64" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;49 -->
+<g id="edge39" class="edge">
+<title>16&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1551.95,-1655.97C1624.98,-1644.37 1720.72,-1629.18 1790.91,-1618.04"/>
+<polygon fill="black" stroke="black" points="1791.75,-1621.45 1801.08,-1616.42 1790.65,-1614.53 1791.75,-1621.45"/>
+</g>
+<!-- 18 -->
+<g id="node11" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="1273.14,-1764 912.14,-1764 912.14,-1728 1273.14,-1728 1273.14,-1764"/>
+<text text-anchor="middle" x="1092.64" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge3" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M1092.64,-1799.7C1092.64,-1791.98 1092.64,-1782.71 1092.64,-1774.11"/>
+<polygon fill="black" stroke="black" points="1096.14,-1774.1 1092.64,-1764.1 1089.14,-1774.1 1096.14,-1774.1"/>
+</g>
+<!-- 19 -->
+<g id="node12" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="1216.14,-1692 1005.14,-1692 1005.14,-1656 1216.14,-1656 1216.14,-1692"/>
+<text text-anchor="middle" x="1110.64" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 18&#45;&gt;19 -->
+<g id="edge4" class="edge">
+<title>18&#45;&gt;19</title>
+<path fill="none" stroke="black" d="M1097.09,-1727.7C1099.09,-1719.9 1101.51,-1710.51 1103.74,-1701.83"/>
+<polygon fill="black" stroke="black" points="1107.14,-1702.66 1106.24,-1692.1 1100.36,-1700.92 1107.14,-1702.66"/>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge6" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M1138.03,-1655.88C1152.54,-1646.81 1170.55,-1635.55 1186.22,-1625.76"/>
+<polygon fill="black" stroke="black" points="1188.51,-1628.46 1195.13,-1620.19 1184.8,-1622.52 1188.51,-1628.46"/>
+</g>
+<!-- 21 -->
+<g id="node14" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="1396.14,-1548 1049.14,-1548 1049.14,-1512 1396.14,-1512 1396.14,-1548"/>
+<text text-anchor="middle" x="1222.64" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge7" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M1222.64,-1583.7C1222.64,-1575.98 1222.64,-1566.71 1222.64,-1558.11"/>
+<polygon fill="black" stroke="black" points="1226.14,-1558.1 1222.64,-1548.1 1219.14,-1558.1 1226.14,-1558.1"/>
+</g>
+<!-- 21&#45;&gt;22 -->
+<g id="edge8" class="edge">
+<title>21&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M1160.34,-1511.97C1114.53,-1499.44 1053.33,-1482.71 1012.08,-1471.43"/>
+<polygon fill="black" stroke="black" points="1012.87,-1468.01 1002.3,-1468.75 1011.03,-1474.77 1012.87,-1468.01"/>
+</g>
+<!-- 23 -->
+<g id="node16" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="1149.14,-1404 784.14,-1404 784.14,-1368 1149.14,-1368 1149.14,-1404"/>
+<text text-anchor="middle" x="966.64" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge10" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M966.64,-1439.7C966.64,-1431.98 966.64,-1422.71 966.64,-1414.11"/>
+<polygon fill="black" stroke="black" points="970.14,-1414.1 966.64,-1404.1 963.14,-1414.1 970.14,-1414.1"/>
+</g>
+<!-- 24 -->
+<g id="node17" class="node">
+<title>24</title>
+<polygon fill="none" stroke="black" points="1081.14,-1332 852.14,-1332 852.14,-1296 1081.14,-1296 1081.14,-1332"/>
+<text text-anchor="middle" x="966.64" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 23&#45;&gt;24 -->
+<g id="edge11" class="edge">
+<title>23&#45;&gt;24</title>
+<path fill="none" stroke="black" d="M966.64,-1367.7C966.64,-1359.98 966.64,-1350.71 966.64,-1342.11"/>
+<polygon fill="black" stroke="black" points="970.14,-1342.1 966.64,-1332.1 963.14,-1342.1 970.14,-1342.1"/>
+</g>
+<!-- 25 -->
+<g id="node18" class="node">
+<title>25</title>
+<polygon fill="none" stroke="black" points="1138.14,-1116 795.14,-1116 795.14,-1080 1138.14,-1080 1138.14,-1116"/>
+<text text-anchor="middle" x="966.64" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 24&#45;&gt;25 -->
+<g id="edge12" class="edge">
+<title>24&#45;&gt;25</title>
+<path fill="none" stroke="black" d="M966.64,-1295.85C966.64,-1258.83 966.64,-1171.18 966.64,-1126.39"/>
+<polygon fill="black" stroke="black" points="970.14,-1126.23 966.64,-1116.23 963.14,-1126.23 970.14,-1126.23"/>
+</g>
+<!-- 37 -->
+<g id="node30" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1051.14,-1044 882.14,-1044 882.14,-1008 1051.14,-1008 1051.14,-1044"/>
+<text text-anchor="middle" x="966.64" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 25&#45;&gt;37 -->
+<g id="edge26" class="edge">
+<title>25&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M966.64,-1079.7C966.64,-1071.98 966.64,-1062.71 966.64,-1054.11"/>
+<polygon fill="black" stroke="black" points="970.14,-1054.1 966.64,-1044.1 963.14,-1054.1 970.14,-1054.1"/>
+</g>
+<!-- 27 -->
+<g id="node20" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="745.14,-1764 384.14,-1764 384.14,-1728 745.14,-1728 745.14,-1764"/>
+<text text-anchor="middle" x="564.64" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 26&#45;&gt;27 -->
+<g id="edge14" class="edge">
+<title>26&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M564.64,-1799.7C564.64,-1791.98 564.64,-1782.71 564.64,-1774.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1774.1 564.64,-1764.1 561.14,-1774.1 568.14,-1774.1"/>
+</g>
+<!-- 28 -->
+<g id="node21" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="670.14,-1692 459.14,-1692 459.14,-1656 670.14,-1656 670.14,-1692"/>
+<text text-anchor="middle" x="564.64" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 27&#45;&gt;28 -->
+<g id="edge15" class="edge">
+<title>27&#45;&gt;28</title>
+<path fill="none" stroke="black" d="M564.64,-1727.7C564.64,-1719.98 564.64,-1710.71 564.64,-1702.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1702.1 564.64,-1692.1 561.14,-1702.1 568.14,-1702.1"/>
+</g>
+<!-- 28&#45;&gt;29 -->
+<g id="edge17" class="edge">
+<title>28&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M564.64,-1655.7C564.64,-1647.98 564.64,-1638.71 564.64,-1630.11"/>
+<polygon fill="black" stroke="black" points="568.14,-1630.1 564.64,-1620.1 561.14,-1630.1 568.14,-1630.1"/>
+</g>
+<!-- 30 -->
+<g id="node23" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="678.14,-1548 331.14,-1548 331.14,-1512 678.14,-1512 678.14,-1548"/>
+<text text-anchor="middle" x="504.64" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge18" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M549.8,-1583.7C542.61,-1575.3 533.84,-1565.07 525.95,-1555.86"/>
+<polygon fill="black" stroke="black" points="528.46,-1553.42 519.3,-1548.1 523.15,-1557.97 528.46,-1553.42"/>
+</g>
+<!-- 30&#45;&gt;31 -->
+<g id="edge19" class="edge">
+<title>30&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M504.64,-1511.7C504.64,-1503.98 504.64,-1494.71 504.64,-1486.11"/>
+<polygon fill="black" stroke="black" points="508.14,-1486.1 504.64,-1476.1 501.14,-1486.1 508.14,-1486.1"/>
+</g>
+<!-- 32 -->
+<g id="node25" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="766.14,-1404 401.14,-1404 401.14,-1368 766.14,-1368 766.14,-1404"/>
+<text text-anchor="middle" x="583.64" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 31&#45;&gt;32 -->
+<g id="edge21" class="edge">
+<title>31&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M524.16,-1439.7C534.02,-1430.97 546.12,-1420.24 556.83,-1410.75"/>
+<polygon fill="black" stroke="black" points="559.17,-1413.36 564.33,-1404.1 554.53,-1408.12 559.17,-1413.36"/>
+</g>
+<!-- 33 -->
+<g id="node26" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="704.14,-1332 475.14,-1332 475.14,-1296 704.14,-1296 704.14,-1332"/>
+<text text-anchor="middle" x="589.64" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge22" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M585.12,-1367.7C585.78,-1359.98 586.58,-1350.71 587.31,-1342.11"/>
+<polygon fill="black" stroke="black" points="590.8,-1342.37 588.17,-1332.1 583.83,-1341.77 590.8,-1342.37"/>
+</g>
+<!-- 34 -->
+<g id="node27" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="748.14,-1260 519.14,-1260 519.14,-1224 748.14,-1224 748.14,-1260"/>
+<text text-anchor="middle" x="633.64" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge23" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M600.51,-1295.7C605.63,-1287.56 611.83,-1277.69 617.48,-1268.7"/>
+<polygon fill="black" stroke="black" points="620.53,-1270.43 622.88,-1260.1 614.6,-1266.71 620.53,-1270.43"/>
+</g>
+<!-- 35 -->
+<g id="node28" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="827.14,-1188 484.14,-1188 484.14,-1152 827.14,-1152 827.14,-1188"/>
+<text text-anchor="middle" x="655.64" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge24" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M639.07,-1223.7C641.53,-1215.9 644.48,-1206.51 647.2,-1197.83"/>
+<polygon fill="black" stroke="black" points="650.6,-1198.69 650.26,-1188.1 643.92,-1196.59 650.6,-1198.69"/>
+</g>
+<!-- 36 -->
+<g id="node29" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="772.14,-1116 561.14,-1116 561.14,-1080 772.14,-1080 772.14,-1116"/>
+<text text-anchor="middle" x="666.64" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge25" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M658.36,-1151.7C659.57,-1143.98 661.02,-1134.71 662.38,-1126.11"/>
+<polygon fill="black" stroke="black" points="665.85,-1126.53 663.95,-1116.1 658.94,-1125.44 665.85,-1126.53"/>
+</g>
+<!-- 36&#45;&gt;37 -->
+<g id="edge27" class="edge">
+<title>36&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M739.64,-1079.97C783.19,-1069.8 838.61,-1056.87 883.79,-1046.33"/>
+<polygon fill="black" stroke="black" points="884.62,-1049.73 893.56,-1044.05 883.03,-1042.91 884.62,-1049.73"/>
+</g>
+<!-- 38 -->
+<g id="node31" class="node">
+<title>38</title>
+<polygon fill="none" stroke="black" points="1189.14,-972 824.14,-972 824.14,-936 1189.14,-936 1189.14,-972"/>
+<text text-anchor="middle" x="1006.64" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 37&#45;&gt;38 -->
+<g id="edge28" class="edge">
+<title>37&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M976.52,-1007.7C981.13,-999.64 986.7,-989.89 991.79,-980.98"/>
+<polygon fill="black" stroke="black" points="994.94,-982.52 996.86,-972.1 988.86,-979.05 994.94,-982.52"/>
+</g>
+<!-- 40 -->
+<g id="node32" class="node">
+<title>40</title>
+<polygon fill="none" stroke="black" points="1080.14,-900 973.14,-900 973.14,-864 1080.14,-864 1080.14,-900"/>
+<text text-anchor="middle" x="1026.64" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 38&#45;&gt;40 -->
+<g id="edge29" class="edge">
+<title>38&#45;&gt;40</title>
+<path fill="none" stroke="black" d="M1011.58,-935.7C1013.81,-927.9 1016.49,-918.51 1018.97,-909.83"/>
+<polygon fill="black" stroke="black" points="1022.37,-910.68 1021.75,-900.1 1015.64,-908.76 1022.37,-910.68"/>
+</g>
+<!-- 40&#45;&gt;41 -->
+<g id="edge30" class="edge">
+<title>40&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1080.5,-868.27C1086.61,-866.83 1092.76,-865.38 1098.64,-864 1161.45,-849.25 1234.08,-832.31 1279.92,-821.64"/>
+<polygon fill="black" stroke="black" points="1280.91,-825 1289.85,-819.33 1279.32,-818.18 1280.91,-825"/>
+</g>
+<!-- 42 -->
+<g id="node34" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1420.64,-756 1246.64,-756 1246.64,-720 1420.64,-720 1420.64,-756"/>
+<text text-anchor="middle" x="1333.64" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge32" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1327.61,-791.7C1328.5,-783.98 1329.55,-774.71 1330.54,-766.11"/>
+<polygon fill="black" stroke="black" points="1334.02,-766.44 1331.68,-756.1 1327.07,-765.64 1334.02,-766.44"/>
+</g>
+<!-- 43 -->
+<g id="node35" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1448.14,-684 1265.14,-684 1265.14,-648 1448.14,-648 1448.14,-684"/>
+<text text-anchor="middle" x="1356.64" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge33" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1339.32,-719.7C1341.88,-711.9 1344.97,-702.51 1347.82,-693.83"/>
+<polygon fill="black" stroke="black" points="1351.22,-694.7 1351.02,-684.1 1344.57,-692.51 1351.22,-694.7"/>
+</g>
+<!-- 44 -->
+<g id="node36" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1452.64,-612 1284.64,-612 1284.64,-576 1452.64,-576 1452.64,-612"/>
+<text text-anchor="middle" x="1368.64" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 43&#45;&gt;44 -->
+<g id="edge34" class="edge">
+<title>43&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M1359.6,-647.7C1360.92,-639.98 1362.51,-630.71 1363.99,-622.11"/>
+<polygon fill="black" stroke="black" points="1367.46,-622.55 1365.7,-612.1 1360.56,-621.37 1367.46,-622.55"/>
+</g>
+<!-- 45 -->
+<g id="node37" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="1546.14,-540 1203.14,-540 1203.14,-504 1546.14,-504 1546.14,-540"/>
+<text text-anchor="middle" x="1374.64" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 44&#45;&gt;45 -->
+<g id="edge35" class="edge">
+<title>44&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M1370.12,-575.7C1370.78,-567.98 1371.58,-558.71 1372.31,-550.11"/>
+<polygon fill="black" stroke="black" points="1375.8,-550.37 1373.17,-540.1 1368.83,-549.77 1375.8,-550.37"/>
+</g>
+<!-- 56 -->
+<g id="node48" class="node">
+<title>56</title>
+<polygon fill="none" stroke="black" points="1558.14,-468 1389.14,-468 1389.14,-432 1558.14,-432 1558.14,-468"/>
+<text text-anchor="middle" x="1473.64" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 45&#45;&gt;56 -->
+<g id="edge48" class="edge">
+<title>45&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1398.85,-503.88C1411.56,-494.89 1427.3,-483.76 1441.06,-474.03"/>
+<polygon fill="black" stroke="black" points="1443.18,-476.82 1449.32,-468.19 1439.14,-471.11 1443.18,-476.82"/>
+</g>
+<!-- 47 -->
+<g id="node39" class="node">
+<title>47</title>
+<polygon fill="none" stroke="black" points="2066.14,-1764 1705.14,-1764 1705.14,-1728 2066.14,-1728 2066.14,-1764"/>
+<text text-anchor="middle" x="1885.64" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 46&#45;&gt;47 -->
+<g id="edge37" class="edge">
+<title>46&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1885.64,-1799.7C1885.64,-1791.98 1885.64,-1782.71 1885.64,-1774.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1774.1 1885.64,-1764.1 1882.14,-1774.1 1889.14,-1774.1"/>
+</g>
+<!-- 48 -->
+<g id="node40" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="1991.14,-1692 1780.14,-1692 1780.14,-1656 1991.14,-1656 1991.14,-1692"/>
+<text text-anchor="middle" x="1885.64" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 47&#45;&gt;48 -->
+<g id="edge38" class="edge">
+<title>47&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1885.64,-1727.7C1885.64,-1719.98 1885.64,-1710.71 1885.64,-1702.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1702.1 1885.64,-1692.1 1882.14,-1702.1 1889.14,-1702.1"/>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge40" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1885.64,-1655.7C1885.64,-1647.98 1885.64,-1638.71 1885.64,-1630.11"/>
+<polygon fill="black" stroke="black" points="1889.14,-1630.1 1885.64,-1620.1 1882.14,-1630.1 1889.14,-1630.1"/>
+</g>
+<!-- 50 -->
+<g id="node42" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="2114.14,-1548 1767.14,-1548 1767.14,-1512 2114.14,-1512 2114.14,-1548"/>
+<text text-anchor="middle" x="1940.64" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 49&#45;&gt;50 -->
+<g id="edge41" class="edge">
+<title>49&#45;&gt;50</title>
+<path fill="none" stroke="black" d="M1899.23,-1583.7C1905.76,-1575.39 1913.7,-1565.28 1920.88,-1556.14"/>
+<polygon fill="black" stroke="black" points="1923.77,-1558.13 1927.2,-1548.1 1918.27,-1553.81 1923.77,-1558.13"/>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge42" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1853.27,-1511.97C1781.96,-1498.06 1684.02,-1478.96 1627.31,-1467.91"/>
+<polygon fill="black" stroke="black" points="1627.7,-1464.42 1617.22,-1465.94 1626.36,-1471.29 1627.7,-1464.42"/>
+</g>
+<!-- 52 -->
+<g id="node44" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="1764.14,-1404 1399.14,-1404 1399.14,-1368 1764.14,-1368 1764.14,-1404"/>
+<text text-anchor="middle" x="1581.64" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge44" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M1581.64,-1439.7C1581.64,-1431.98 1581.64,-1422.71 1581.64,-1414.11"/>
+<polygon fill="black" stroke="black" points="1585.14,-1414.1 1581.64,-1404.1 1578.14,-1414.1 1585.14,-1414.1"/>
+</g>
+<!-- 53 -->
+<g id="node45" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="1695.14,-1332 1466.14,-1332 1466.14,-1296 1695.14,-1296 1695.14,-1332"/>
+<text text-anchor="middle" x="1580.64" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge45" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1581.39,-1367.7C1581.28,-1359.98 1581.15,-1350.71 1581.02,-1342.11"/>
+<polygon fill="black" stroke="black" points="1584.52,-1342.05 1580.88,-1332.1 1577.52,-1342.15 1584.52,-1342.05"/>
+</g>
+<!-- 54 -->
+<g id="node46" class="node">
+<title>54</title>
+<polygon fill="none" stroke="black" points="1751.14,-1260 1408.14,-1260 1408.14,-1224 1751.14,-1224 1751.14,-1260"/>
+<text text-anchor="middle" x="1579.64" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 53&#45;&gt;54 -->
+<g id="edge46" class="edge">
+<title>53&#45;&gt;54</title>
+<path fill="none" stroke="black" d="M1580.39,-1295.7C1580.28,-1287.98 1580.15,-1278.71 1580.02,-1270.11"/>
+<polygon fill="black" stroke="black" points="1583.52,-1270.05 1579.88,-1260.1 1576.52,-1270.15 1583.52,-1270.05"/>
+</g>
+<!-- 55 -->
+<g id="node47" class="node">
+<title>55</title>
+<polygon fill="none" stroke="black" points="1682.14,-1044 1471.14,-1044 1471.14,-1008 1682.14,-1008 1682.14,-1044"/>
+<text text-anchor="middle" x="1576.64" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 54&#45;&gt;55 -->
+<g id="edge47" class="edge">
+<title>54&#45;&gt;55</title>
+<path fill="none" stroke="black" d="M1579.4,-1223.85C1578.88,-1186.83 1577.65,-1099.18 1577.02,-1054.39"/>
+<polygon fill="black" stroke="black" points="1580.52,-1054.18 1576.88,-1044.23 1573.52,-1054.28 1580.52,-1054.18"/>
+</g>
+<!-- 55&#45;&gt;56 -->
+<g id="edge49" class="edge">
+<title>55&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1576.64,-1007.95C1576.64,-981.29 1576.64,-928.11 1576.64,-883 1576.64,-883 1576.64,-883 1576.64,-593 1576.64,-552.36 1579.06,-537.21 1555.64,-504 1546.9,-491.62 1534.27,-481.42 1521.53,-473.4"/>
+<polygon fill="black" stroke="black" points="1523.13,-470.27 1512.74,-468.19 1519.56,-476.3 1523.13,-470.27"/>
+</g>
+<!-- 57 -->
+<g id="node49" class="node">
+<title>57</title>
+<polygon fill="none" stroke="black" points="1656.14,-396 1291.14,-396 1291.14,-360 1656.14,-360 1656.14,-396"/>
+<text text-anchor="middle" x="1473.64" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 56&#45;&gt;57 -->
+<g id="edge50" class="edge">
+<title>56&#45;&gt;57</title>
+<path fill="none" stroke="black" d="M1473.64,-431.7C1473.64,-423.98 1473.64,-414.71 1473.64,-406.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-406.1 1473.64,-396.1 1470.14,-406.1 1477.14,-406.1"/>
+</g>
+<!-- 58 -->
+<g id="node50" class="node">
+<title>58</title>
+<polygon fill="none" stroke="black" points="1588.14,-324 1359.14,-324 1359.14,-288 1588.14,-288 1588.14,-324"/>
+<text text-anchor="middle" x="1473.64" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 57&#45;&gt;58 -->
+<g id="edge51" class="edge">
+<title>57&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1473.64,-359.7C1473.64,-351.98 1473.64,-342.71 1473.64,-334.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-334.1 1473.64,-324.1 1470.14,-334.1 1477.14,-334.1"/>
+</g>
+<!-- 59 -->
+<g id="node51" class="node">
+<title>59</title>
+<polygon fill="none" stroke="black" points="1506.14,-252 1441.14,-252 1441.14,-216 1506.14,-216 1506.14,-252"/>
+<text text-anchor="middle" x="1473.64" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 58&#45;&gt;59 -->
+<g id="edge52" class="edge">
+<title>58&#45;&gt;59</title>
+<path fill="none" stroke="black" d="M1473.64,-287.7C1473.64,-279.98 1473.64,-270.71 1473.64,-262.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-262.1 1473.64,-252.1 1470.14,-262.1 1477.14,-262.1"/>
+</g>
+<!-- 60 -->
+<g id="node52" class="node">
+<title>60</title>
+<polygon fill="none" stroke="black" points="1647.14,-180 1300.14,-180 1300.14,-144 1647.14,-144 1647.14,-180"/>
+<text text-anchor="middle" x="1473.64" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 59&#45;&gt;60 -->
+<g id="edge53" class="edge">
+<title>59&#45;&gt;60</title>
+<path fill="none" stroke="black" d="M1473.64,-215.7C1473.64,-207.98 1473.64,-198.71 1473.64,-190.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-190.1 1473.64,-180.1 1470.14,-190.1 1477.14,-190.1"/>
+</g>
+<!-- 61 -->
+<g id="node53" class="node">
+<title>61</title>
+<polygon fill="none" stroke="black" points="1516.64,-108 1430.64,-108 1430.64,-72 1516.64,-72 1516.64,-108"/>
+<text text-anchor="middle" x="1473.64" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 60&#45;&gt;61 -->
+<g id="edge54" class="edge">
+<title>60&#45;&gt;61</title>
+<path fill="none" stroke="black" d="M1473.64,-143.7C1473.64,-135.98 1473.64,-126.71 1473.64,-118.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-118.1 1473.64,-108.1 1470.14,-118.1 1477.14,-118.1"/>
+</g>
+<!-- 62 -->
+<g id="node54" class="node">
+<title>62</title>
+<polygon fill="none" stroke="black" points="1513.64,-36 1433.64,-36 1433.64,0 1513.64,0 1513.64,-36"/>
+<text text-anchor="middle" x="1473.64" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 61&#45;&gt;62 -->
+<g id="edge55" class="edge">
+<title>61&#45;&gt;62</title>
+<path fill="none" stroke="black" d="M1473.64,-71.7C1473.64,-63.98 1473.64,-54.71 1473.64,-46.11"/>
+<polygon fill="black" stroke="black" points="1477.14,-46.1 1473.64,-36.1 1470.14,-46.1 1477.14,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert-tvm_70_0.svg b/images/bert-pytorch/bert-tvm_70_0.svg
new file mode 100644
index 0000000..f015c0b
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_70_0.svg
@@ -0,0 +1,667 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="1718pt" height="1916pt"
+ viewBox="0.00 0.00 1717.50 1916.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1912)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1912 1713.5,-1912 1713.5,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="947.5" cy="-1746" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="947.5" y="-1742.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 10 -->
+<g id="node3" class="node">
+<title>10</title>
+<polygon fill="none" stroke="black" points="1089.5,-1692 737.5,-1692 737.5,-1656 1089.5,-1656 1089.5,-1692"/>
+<text text-anchor="middle" x="913.5" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;10 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;10</title>
+<path fill="none" stroke="black" d="M939.1,-1727.7C935.23,-1719.73 930.55,-1710.1 926.26,-1701.26"/>
+<polygon fill="black" stroke="black" points="929.33,-1699.57 921.81,-1692.1 923.03,-1702.63 929.33,-1699.57"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="1084.5" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1084.5" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 39 -->
+<g id="node31" class="node">
+<title>39</title>
+<polygon fill="none" stroke="black" points="1120,-828 1049,-828 1049,-792 1120,-792 1120,-828"/>
+<text text-anchor="middle" x="1084.5" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 1&#45;&gt;39 -->
+<g id="edge31" class="edge">
+<title>1&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M1084.5,-863.7C1084.5,-855.98 1084.5,-846.71 1084.5,-838.11"/>
+<polygon fill="black" stroke="black" points="1088,-838.1 1084.5,-828.1 1081,-838.1 1088,-838.1"/>
+</g>
+<!-- 15 -->
+<g id="node8" class="node">
+<title>15</title>
+<polygon fill="none" stroke="black" points="834,-1620 665,-1620 665,-1584 834,-1584 834,-1620"/>
+<text text-anchor="middle" x="749.5" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 10&#45;&gt;15 -->
+<g id="edge5" class="edge">
+<title>10&#45;&gt;15</title>
+<path fill="none" stroke="black" d="M873.38,-1655.88C850.86,-1646.26 822.58,-1634.19 798.73,-1624.01"/>
+<polygon fill="black" stroke="black" points="800.06,-1620.77 789.49,-1620.07 797.31,-1627.21 800.06,-1620.77"/>
+</g>
+<!-- 26 -->
+<g id="node19" class="node">
+<title>26</title>
+<polygon fill="none" stroke="black" points="275,-1620 106,-1620 106,-1584 275,-1584 275,-1620"/>
+<text text-anchor="middle" x="190.5" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 10&#45;&gt;26 -->
+<g id="edge16" class="edge">
+<title>10&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M737.56,-1655.97C595.14,-1642.18 400.01,-1623.28 285.39,-1612.19"/>
+<polygon fill="black" stroke="black" points="285.53,-1608.68 275.24,-1611.2 284.86,-1615.65 285.53,-1608.68"/>
+</g>
+<!-- 48 -->
+<g id="node40" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="1405,-1620 1236,-1620 1236,-1584 1405,-1584 1405,-1620"/>
+<text text-anchor="middle" x="1320.5" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 10&#45;&gt;48 -->
+<g id="edge39" class="edge">
+<title>10&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1012.54,-1655.97C1077.43,-1644.81 1161.75,-1630.3 1225.75,-1619.3"/>
+<polygon fill="black" stroke="black" points="1226.58,-1622.71 1235.84,-1617.56 1225.39,-1615.81 1226.58,-1622.71"/>
+</g>
+<!-- 11 -->
+<g id="node4" class="node">
+<title>11</title>
+<polygon fill="none" stroke="black" points="690.5,-1908 466.5,-1908 466.5,-1872 690.5,-1872 690.5,-1908"/>
+<text text-anchor="middle" x="578.5" y="-1886.3" font-family="Times,serif" font-size="14.00">Constant((768, 768), float32)</text>
+</g>
+<!-- 12 -->
+<g id="node5" class="node">
+<title>12</title>
+<polygon fill="none" stroke="black" points="675,-1836 482,-1836 482,-1800 675,-1800 675,-1836"/>
+<text text-anchor="middle" x="578.5" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 11&#45;&gt;12 -->
+<g id="edge2" class="edge">
+<title>11&#45;&gt;12</title>
+<path fill="none" stroke="black" d="M578.5,-1871.7C578.5,-1863.98 578.5,-1854.71 578.5,-1846.11"/>
+<polygon fill="black" stroke="black" points="582,-1846.1 578.5,-1836.1 575,-1846.1 582,-1846.1"/>
+</g>
+<!-- 13 -->
+<g id="node6" class="node">
+<title>13</title>
+<polygon fill="none" stroke="black" points="759,-1764 398,-1764 398,-1728 759,-1728 759,-1764"/>
+<text text-anchor="middle" x="578.5" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 12&#45;&gt;13 -->
+<g id="edge3" class="edge">
+<title>12&#45;&gt;13</title>
+<path fill="none" stroke="black" d="M578.5,-1799.7C578.5,-1791.98 578.5,-1782.71 578.5,-1774.11"/>
+<polygon fill="black" stroke="black" points="582,-1774.1 578.5,-1764.1 575,-1774.1 582,-1774.1"/>
+</g>
+<!-- 14 -->
+<g id="node7" class="node">
+<title>14</title>
+<polygon fill="none" stroke="black" points="701,-1692 490,-1692 490,-1656 701,-1656 701,-1692"/>
+<text text-anchor="middle" x="595.5" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 13&#45;&gt;14 -->
+<g id="edge4" class="edge">
+<title>13&#45;&gt;14</title>
+<path fill="none" stroke="black" d="M582.7,-1727.7C584.6,-1719.9 586.88,-1710.51 588.98,-1701.83"/>
+<polygon fill="black" stroke="black" points="592.39,-1702.65 591.35,-1692.1 585.58,-1701 592.39,-1702.65"/>
+</g>
+<!-- 14&#45;&gt;15 -->
+<g id="edge6" class="edge">
+<title>14&#45;&gt;15</title>
+<path fill="none" stroke="black" d="M633.17,-1655.88C654.13,-1646.35 680.4,-1634.41 702.68,-1624.28"/>
+<polygon fill="black" stroke="black" points="704.29,-1627.39 711.95,-1620.07 701.4,-1621.02 704.29,-1627.39"/>
+</g>
+<!-- 16 -->
+<g id="node9" class="node">
+<title>16</title>
+<polygon fill="none" stroke="black" points="923,-1548 576,-1548 576,-1512 923,-1512 923,-1548"/>
+<text text-anchor="middle" x="749.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 15&#45;&gt;16 -->
+<g id="edge7" class="edge">
+<title>15&#45;&gt;16</title>
+<path fill="none" stroke="black" d="M749.5,-1583.7C749.5,-1575.98 749.5,-1566.71 749.5,-1558.11"/>
+<polygon fill="black" stroke="black" points="753,-1558.1 749.5,-1548.1 746,-1558.1 753,-1558.1"/>
+</g>
+<!-- 18 -->
+<g id="node11" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="869,-1476 798,-1476 798,-1440 869,-1440 869,-1476"/>
+<text text-anchor="middle" x="833.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 16&#45;&gt;18 -->
+<g id="edge8" class="edge">
+<title>16&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M770.26,-1511.7C780.74,-1502.97 793.61,-1492.24 805,-1482.75"/>
+<polygon fill="black" stroke="black" points="807.53,-1485.19 812.97,-1476.1 803.05,-1479.82 807.53,-1485.19"/>
+</g>
+<!-- 17 -->
+<g id="node10" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="1133.5,-1548 941.5,-1548 941.5,-1512 1133.5,-1512 1133.5,-1548"/>
+<text text-anchor="middle" x="1037.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge9" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M987.86,-1511.97C954.62,-1500.56 911.2,-1485.66 878.88,-1474.57"/>
+<polygon fill="black" stroke="black" points="879.83,-1471.2 869.23,-1471.26 877.55,-1477.82 879.83,-1471.2"/>
+</g>
+<!-- 19 -->
+<g id="node12" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="1016,-1404 651,-1404 651,-1368 1016,-1368 1016,-1404"/>
+<text text-anchor="middle" x="833.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 18&#45;&gt;19 -->
+<g id="edge10" class="edge">
+<title>18&#45;&gt;19</title>
+<path fill="none" stroke="black" d="M833.5,-1439.7C833.5,-1431.98 833.5,-1422.71 833.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="837,-1414.1 833.5,-1404.1 830,-1414.1 837,-1414.1"/>
+</g>
+<!-- 20 -->
+<g id="node13" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="921,-1332 692,-1332 692,-1296 921,-1296 921,-1332"/>
+<text text-anchor="middle" x="806.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge11" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M826.83,-1367.7C823.79,-1359.81 820.12,-1350.3 816.74,-1341.55"/>
+<polygon fill="black" stroke="black" points="819.96,-1340.17 813.1,-1332.1 813.43,-1342.69 819.96,-1340.17"/>
+</g>
+<!-- 21 -->
+<g id="node14" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="973,-1116 630,-1116 630,-1080 973,-1080 973,-1116"/>
+<text text-anchor="middle" x="801.5" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge12" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M806.1,-1295.85C805.23,-1258.83 803.19,-1171.18 802.14,-1126.39"/>
+<polygon fill="black" stroke="black" points="805.64,-1126.15 801.9,-1116.23 798.64,-1126.31 805.64,-1126.15"/>
+</g>
+<!-- 35 -->
+<g id="node28" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="880,-1044 711,-1044 711,-1008 880,-1008 880,-1044"/>
+<text text-anchor="middle" x="795.5" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 21&#45;&gt;35 -->
+<g id="edge26" class="edge">
+<title>21&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M800.02,-1079.7C799.36,-1071.98 798.56,-1062.71 797.82,-1054.11"/>
+<polygon fill="black" stroke="black" points="801.31,-1053.77 796.97,-1044.1 794.33,-1054.37 801.31,-1053.77"/>
+</g>
+<!-- 22 -->
+<g id="node15" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="302.5,-1908 78.5,-1908 78.5,-1872 302.5,-1872 302.5,-1908"/>
+<text text-anchor="middle" x="190.5" y="-1886.3" font-family="Times,serif" font-size="14.00">Constant((768, 768), float32)</text>
+</g>
+<!-- 23 -->
+<g id="node16" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="287,-1836 94,-1836 94,-1800 287,-1800 287,-1836"/>
+<text text-anchor="middle" x="190.5" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge13" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M190.5,-1871.7C190.5,-1863.98 190.5,-1854.71 190.5,-1846.11"/>
+<polygon fill="black" stroke="black" points="194,-1846.1 190.5,-1836.1 187,-1846.1 194,-1846.1"/>
+</g>
+<!-- 24 -->
+<g id="node17" class="node">
+<title>24</title>
+<polygon fill="none" stroke="black" points="371,-1764 10,-1764 10,-1728 371,-1728 371,-1764"/>
+<text text-anchor="middle" x="190.5" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 23&#45;&gt;24 -->
+<g id="edge14" class="edge">
+<title>23&#45;&gt;24</title>
+<path fill="none" stroke="black" d="M190.5,-1799.7C190.5,-1791.98 190.5,-1782.71 190.5,-1774.11"/>
+<polygon fill="black" stroke="black" points="194,-1774.1 190.5,-1764.1 187,-1774.1 194,-1774.1"/>
+</g>
+<!-- 25 -->
+<g id="node18" class="node">
+<title>25</title>
+<polygon fill="none" stroke="black" points="296,-1692 85,-1692 85,-1656 296,-1656 296,-1692"/>
+<text text-anchor="middle" x="190.5" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 24&#45;&gt;25 -->
+<g id="edge15" class="edge">
+<title>24&#45;&gt;25</title>
+<path fill="none" stroke="black" d="M190.5,-1727.7C190.5,-1719.98 190.5,-1710.71 190.5,-1702.11"/>
+<polygon fill="black" stroke="black" points="194,-1702.1 190.5,-1692.1 187,-1702.1 194,-1702.1"/>
+</g>
+<!-- 25&#45;&gt;26 -->
+<g id="edge17" class="edge">
+<title>25&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M190.5,-1655.7C190.5,-1647.98 190.5,-1638.71 190.5,-1630.11"/>
+<polygon fill="black" stroke="black" points="194,-1630.1 190.5,-1620.1 187,-1630.1 194,-1630.1"/>
+</g>
+<!-- 27 -->
+<g id="node20" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="347,-1548 0,-1548 0,-1512 347,-1512 347,-1548"/>
+<text text-anchor="middle" x="173.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 26&#45;&gt;27 -->
+<g id="edge18" class="edge">
+<title>26&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M186.3,-1583.7C184.4,-1575.9 182.12,-1566.51 180.02,-1557.83"/>
+<polygon fill="black" stroke="black" points="183.42,-1557 177.65,-1548.1 176.61,-1558.65 183.42,-1557"/>
+</g>
+<!-- 29 -->
+<g id="node22" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="486,-1476 415,-1476 415,-1440 486,-1440 486,-1476"/>
+<text text-anchor="middle" x="450.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 27&#45;&gt;29 -->
+<g id="edge19" class="edge">
+<title>27&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M240.91,-1511.97C291.72,-1499.13 360.04,-1481.86 404.66,-1470.58"/>
+<polygon fill="black" stroke="black" points="405.8,-1473.91 414.64,-1468.06 404.09,-1467.12 405.8,-1473.91"/>
+</g>
+<!-- 28 -->
+<g id="node21" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="557.5,-1548 365.5,-1548 365.5,-1512 557.5,-1512 557.5,-1548"/>
+<text text-anchor="middle" x="461.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 28&#45;&gt;29 -->
+<g id="edge20" class="edge">
+<title>28&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M458.78,-1511.7C457.57,-1503.98 456.11,-1494.71 454.76,-1486.11"/>
+<polygon fill="black" stroke="black" points="458.2,-1485.44 453.19,-1476.1 451.28,-1486.53 458.2,-1485.44"/>
+</g>
+<!-- 30 -->
+<g id="node23" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="633,-1404 268,-1404 268,-1368 633,-1368 633,-1404"/>
+<text text-anchor="middle" x="450.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge21" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M450.5,-1439.7C450.5,-1431.98 450.5,-1422.71 450.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="454,-1414.1 450.5,-1404.1 447,-1414.1 454,-1414.1"/>
+</g>
+<!-- 31 -->
+<g id="node24" class="node">
+<title>31</title>
+<polygon fill="none" stroke="black" points="566,-1332 337,-1332 337,-1296 566,-1296 566,-1332"/>
+<text text-anchor="middle" x="451.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 30&#45;&gt;31 -->
+<g id="edge22" class="edge">
+<title>30&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M450.75,-1367.7C450.86,-1359.98 450.99,-1350.71 451.11,-1342.11"/>
+<polygon fill="black" stroke="black" points="454.61,-1342.15 451.26,-1332.1 447.61,-1342.05 454.61,-1342.15"/>
+</g>
+<!-- 32 -->
+<g id="node25" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="596,-1260 367,-1260 367,-1224 596,-1224 596,-1260"/>
+<text text-anchor="middle" x="481.5" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 31&#45;&gt;32 -->
+<g id="edge23" class="edge">
+<title>31&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M458.92,-1295.7C462.29,-1287.81 466.37,-1278.3 470.12,-1269.55"/>
+<polygon fill="black" stroke="black" points="473.45,-1270.67 474.17,-1260.1 467.01,-1267.92 473.45,-1270.67"/>
+</g>
+<!-- 33 -->
+<g id="node26" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="668,-1188 325,-1188 325,-1152 668,-1152 668,-1188"/>
+<text text-anchor="middle" x="496.5" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge24" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M485.21,-1223.7C486.86,-1215.98 488.85,-1206.71 490.69,-1198.11"/>
+<polygon fill="black" stroke="black" points="494.16,-1198.62 492.83,-1188.1 487.32,-1197.15 494.16,-1198.62"/>
+</g>
+<!-- 34 -->
+<g id="node27" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="610,-1116 399,-1116 399,-1080 610,-1080 610,-1116"/>
+<text text-anchor="middle" x="504.5" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge25" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M498.48,-1151.7C499.36,-1143.98 500.42,-1134.71 501.4,-1126.11"/>
+<polygon fill="black" stroke="black" points="504.89,-1126.44 502.55,-1116.1 497.93,-1125.64 504.89,-1126.44"/>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge27" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M575.31,-1079.97C617.47,-1069.83 671.09,-1056.93 714.87,-1046.4"/>
+<polygon fill="black" stroke="black" points="715.72,-1049.79 724.62,-1044.05 714.08,-1042.99 715.72,-1049.79"/>
+</g>
+<!-- 36 -->
+<g id="node29" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="978,-972 613,-972 613,-936 978,-936 978,-972"/>
+<text text-anchor="middle" x="795.5" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge28" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M795.5,-1007.7C795.5,-999.98 795.5,-990.71 795.5,-982.11"/>
+<polygon fill="black" stroke="black" points="799,-982.1 795.5,-972.1 792,-982.1 799,-982.1"/>
+</g>
+<!-- 38 -->
+<g id="node30" class="node">
+<title>38</title>
+<polygon fill="none" stroke="black" points="849,-900 742,-900 742,-864 849,-864 849,-900"/>
+<text text-anchor="middle" x="795.5" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 36&#45;&gt;38 -->
+<g id="edge29" class="edge">
+<title>36&#45;&gt;38</title>
+<path fill="none" stroke="black" d="M795.5,-935.7C795.5,-927.98 795.5,-918.71 795.5,-910.11"/>
+<polygon fill="black" stroke="black" points="799,-910.1 795.5,-900.1 792,-910.1 799,-910.1"/>
+</g>
+<!-- 38&#45;&gt;39 -->
+<g id="edge30" class="edge">
+<title>38&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M849.24,-866.13C852.03,-865.4 854.8,-864.69 857.5,-864 920.04,-848.13 992.73,-831.47 1038.67,-821.16"/>
+<polygon fill="black" stroke="black" points="1039.63,-824.53 1048.62,-818.93 1038.1,-817.7 1039.63,-824.53"/>
+</g>
+<!-- 40 -->
+<g id="node32" class="node">
+<title>40</title>
+<polygon fill="none" stroke="black" points="1194.5,-756 1020.5,-756 1020.5,-720 1194.5,-720 1194.5,-756"/>
+<text text-anchor="middle" x="1107.5" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 39&#45;&gt;40 -->
+<g id="edge32" class="edge">
+<title>39&#45;&gt;40</title>
+<path fill="none" stroke="black" d="M1090.19,-791.7C1092.75,-783.9 1095.83,-774.51 1098.68,-765.83"/>
+<polygon fill="black" stroke="black" points="1102.08,-766.7 1101.88,-756.1 1095.43,-764.51 1102.08,-766.7"/>
+</g>
+<!-- 41 -->
+<g id="node33" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1211,-684 1028,-684 1028,-648 1211,-648 1211,-684"/>
+<text text-anchor="middle" x="1119.5" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 40&#45;&gt;41 -->
+<g id="edge33" class="edge">
+<title>40&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1110.47,-719.7C1111.79,-711.98 1113.38,-702.71 1114.85,-694.11"/>
+<polygon fill="black" stroke="black" points="1118.33,-694.55 1116.57,-684.1 1111.43,-693.37 1118.33,-694.55"/>
+</g>
+<!-- 42 -->
+<g id="node34" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1209.5,-612 1041.5,-612 1041.5,-576 1209.5,-576 1209.5,-612"/>
+<text text-anchor="middle" x="1125.5" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge34" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1120.98,-647.7C1121.64,-639.98 1122.44,-630.71 1123.18,-622.11"/>
+<polygon fill="black" stroke="black" points="1126.67,-622.37 1124.03,-612.1 1119.69,-621.77 1126.67,-622.37"/>
+</g>
+<!-- 43 -->
+<g id="node35" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1300,-540 957,-540 957,-504 1300,-504 1300,-540"/>
+<text text-anchor="middle" x="1128.5" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge35" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1126.24,-575.7C1126.57,-567.98 1126.97,-558.71 1127.34,-550.11"/>
+<polygon fill="black" stroke="black" points="1130.84,-550.25 1127.77,-540.1 1123.84,-549.95 1130.84,-550.25"/>
+</g>
+<!-- 56 -->
+<g id="node48" class="node">
+<title>56</title>
+<polygon fill="none" stroke="black" points="1312,-468 1143,-468 1143,-432 1312,-432 1312,-468"/>
+<text text-anchor="middle" x="1227.5" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 43&#45;&gt;56 -->
+<g id="edge48" class="edge">
+<title>43&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1152.72,-503.88C1165.42,-494.89 1181.16,-483.76 1194.92,-474.03"/>
+<polygon fill="black" stroke="black" points="1197.04,-476.82 1203.19,-468.19 1193,-471.11 1197.04,-476.82"/>
+</g>
+<!-- 44 -->
+<g id="node36" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1432.5,-1908 1208.5,-1908 1208.5,-1872 1432.5,-1872 1432.5,-1908"/>
+<text text-anchor="middle" x="1320.5" y="-1886.3" font-family="Times,serif" font-size="14.00">Constant((768, 768), float32)</text>
+</g>
+<!-- 45 -->
+<g id="node37" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="1417,-1836 1224,-1836 1224,-1800 1417,-1800 1417,-1836"/>
+<text text-anchor="middle" x="1320.5" y="-1814.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 44&#45;&gt;45 -->
+<g id="edge36" class="edge">
+<title>44&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M1320.5,-1871.7C1320.5,-1863.98 1320.5,-1854.71 1320.5,-1846.11"/>
+<polygon fill="black" stroke="black" points="1324,-1846.1 1320.5,-1836.1 1317,-1846.1 1324,-1846.1"/>
+</g>
+<!-- 46 -->
+<g id="node38" class="node">
+<title>46</title>
+<polygon fill="none" stroke="black" points="1501,-1764 1140,-1764 1140,-1728 1501,-1728 1501,-1764"/>
+<text text-anchor="middle" x="1320.5" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 768, 768], reverse=0)</text>
+</g>
+<!-- 45&#45;&gt;46 -->
+<g id="edge37" class="edge">
+<title>45&#45;&gt;46</title>
+<path fill="none" stroke="black" d="M1320.5,-1799.7C1320.5,-1791.98 1320.5,-1782.71 1320.5,-1774.11"/>
+<polygon fill="black" stroke="black" points="1324,-1774.1 1320.5,-1764.1 1317,-1774.1 1324,-1774.1"/>
+</g>
+<!-- 47 -->
+<g id="node39" class="node">
+<title>47</title>
+<polygon fill="none" stroke="black" points="1426,-1692 1215,-1692 1215,-1656 1426,-1656 1426,-1692"/>
+<text text-anchor="middle" x="1320.5" y="-1670.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 46&#45;&gt;47 -->
+<g id="edge38" class="edge">
+<title>46&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1320.5,-1727.7C1320.5,-1719.98 1320.5,-1710.71 1320.5,-1702.11"/>
+<polygon fill="black" stroke="black" points="1324,-1702.1 1320.5,-1692.1 1317,-1702.1 1324,-1702.1"/>
+</g>
+<!-- 47&#45;&gt;48 -->
+<g id="edge40" class="edge">
+<title>47&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1320.5,-1655.7C1320.5,-1647.98 1320.5,-1638.71 1320.5,-1630.11"/>
+<polygon fill="black" stroke="black" points="1324,-1630.1 1320.5,-1620.1 1317,-1630.1 1324,-1630.1"/>
+</g>
+<!-- 49 -->
+<g id="node41" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="1499,-1548 1152,-1548 1152,-1512 1499,-1512 1499,-1548"/>
+<text text-anchor="middle" x="1325.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge41" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1321.74,-1583.7C1322.29,-1575.98 1322.95,-1566.71 1323.56,-1558.11"/>
+<polygon fill="black" stroke="black" points="1327.06,-1558.33 1324.28,-1548.1 1320.07,-1557.83 1327.06,-1558.33"/>
+</g>
+<!-- 51 -->
+<g id="node43" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="1366,-1476 1295,-1476 1295,-1440 1366,-1440 1366,-1476"/>
+<text text-anchor="middle" x="1330.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 49&#45;&gt;51 -->
+<g id="edge42" class="edge">
+<title>49&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1326.74,-1511.7C1327.29,-1503.98 1327.95,-1494.71 1328.56,-1486.11"/>
+<polygon fill="black" stroke="black" points="1332.06,-1486.33 1329.28,-1476.1 1325.07,-1485.83 1332.06,-1486.33"/>
+</g>
+<!-- 50 -->
+<g id="node42" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="1709.5,-1548 1517.5,-1548 1517.5,-1512 1709.5,-1512 1709.5,-1548"/>
+<text text-anchor="middle" x="1613.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge43" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1544.63,-1511.97C1492.31,-1499.02 1421.8,-1481.58 1376.25,-1470.32"/>
+<polygon fill="black" stroke="black" points="1376.9,-1466.87 1366.36,-1467.87 1375.22,-1473.67 1376.9,-1466.87"/>
+</g>
+<!-- 52 -->
+<g id="node44" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="1513,-1404 1148,-1404 1148,-1368 1513,-1368 1513,-1404"/>
+<text text-anchor="middle" x="1330.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge44" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M1330.5,-1439.7C1330.5,-1431.98 1330.5,-1422.71 1330.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="1334,-1414.1 1330.5,-1404.1 1327,-1414.1 1334,-1414.1"/>
+</g>
+<!-- 53 -->
+<g id="node45" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="1445,-1332 1216,-1332 1216,-1296 1445,-1296 1445,-1332"/>
+<text text-anchor="middle" x="1330.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge45" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1330.5,-1367.7C1330.5,-1359.98 1330.5,-1350.71 1330.5,-1342.11"/>
+<polygon fill="black" stroke="black" points="1334,-1342.1 1330.5,-1332.1 1327,-1342.1 1334,-1342.1"/>
+</g>
+<!-- 54 -->
+<g id="node46" class="node">
+<title>54</title>
+<polygon fill="none" stroke="black" points="1502,-1260 1159,-1260 1159,-1224 1502,-1224 1502,-1260"/>
+<text text-anchor="middle" x="1330.5" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 53&#45;&gt;54 -->
+<g id="edge46" class="edge">
+<title>53&#45;&gt;54</title>
+<path fill="none" stroke="black" d="M1330.5,-1295.7C1330.5,-1287.98 1330.5,-1278.71 1330.5,-1270.11"/>
+<polygon fill="black" stroke="black" points="1334,-1270.1 1330.5,-1260.1 1327,-1270.1 1334,-1270.1"/>
+</g>
+<!-- 55 -->
+<g id="node47" class="node">
+<title>55</title>
+<polygon fill="none" stroke="black" points="1436,-1044 1225,-1044 1225,-1008 1436,-1008 1436,-1044"/>
+<text text-anchor="middle" x="1330.5" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 54&#45;&gt;55 -->
+<g id="edge47" class="edge">
+<title>54&#45;&gt;55</title>
+<path fill="none" stroke="black" d="M1330.5,-1223.85C1330.5,-1186.83 1330.5,-1099.18 1330.5,-1054.39"/>
+<polygon fill="black" stroke="black" points="1334,-1054.23 1330.5,-1044.23 1327,-1054.23 1334,-1054.23"/>
+</g>
+<!-- 55&#45;&gt;56 -->
+<g id="edge49" class="edge">
+<title>55&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1330.5,-1007.95C1330.5,-981.29 1330.5,-928.11 1330.5,-883 1330.5,-883 1330.5,-883 1330.5,-593 1330.5,-552.36 1332.93,-537.21 1309.5,-504 1300.77,-491.62 1288.14,-481.42 1275.4,-473.4"/>
+<polygon fill="black" stroke="black" points="1276.99,-470.27 1266.6,-468.19 1273.42,-476.3 1276.99,-470.27"/>
+</g>
+<!-- 57 -->
+<g id="node49" class="node">
+<title>57</title>
+<polygon fill="none" stroke="black" points="1410,-396 1045,-396 1045,-360 1410,-360 1410,-396"/>
+<text text-anchor="middle" x="1227.5" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 56&#45;&gt;57 -->
+<g id="edge50" class="edge">
+<title>56&#45;&gt;57</title>
+<path fill="none" stroke="black" d="M1227.5,-431.7C1227.5,-423.98 1227.5,-414.71 1227.5,-406.11"/>
+<polygon fill="black" stroke="black" points="1231,-406.1 1227.5,-396.1 1224,-406.1 1231,-406.1"/>
+</g>
+<!-- 58 -->
+<g id="node50" class="node">
+<title>58</title>
+<polygon fill="none" stroke="black" points="1342,-324 1113,-324 1113,-288 1342,-288 1342,-324"/>
+<text text-anchor="middle" x="1227.5" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 57&#45;&gt;58 -->
+<g id="edge51" class="edge">
+<title>57&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1227.5,-359.7C1227.5,-351.98 1227.5,-342.71 1227.5,-334.11"/>
+<polygon fill="black" stroke="black" points="1231,-334.1 1227.5,-324.1 1224,-334.1 1231,-334.1"/>
+</g>
+<!-- 59 -->
+<g id="node51" class="node">
+<title>59</title>
+<polygon fill="none" stroke="black" points="1260,-252 1195,-252 1195,-216 1260,-216 1260,-252"/>
+<text text-anchor="middle" x="1227.5" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 58&#45;&gt;59 -->
+<g id="edge52" class="edge">
+<title>58&#45;&gt;59</title>
+<path fill="none" stroke="black" d="M1227.5,-287.7C1227.5,-279.98 1227.5,-270.71 1227.5,-262.11"/>
+<polygon fill="black" stroke="black" points="1231,-262.1 1227.5,-252.1 1224,-262.1 1231,-262.1"/>
+</g>
+<!-- 60 -->
+<g id="node52" class="node">
+<title>60</title>
+<polygon fill="none" stroke="black" points="1401,-180 1054,-180 1054,-144 1401,-144 1401,-180"/>
+<text text-anchor="middle" x="1227.5" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 59&#45;&gt;60 -->
+<g id="edge53" class="edge">
+<title>59&#45;&gt;60</title>
+<path fill="none" stroke="black" d="M1227.5,-215.7C1227.5,-207.98 1227.5,-198.71 1227.5,-190.11"/>
+<polygon fill="black" stroke="black" points="1231,-190.1 1227.5,-180.1 1224,-190.1 1231,-190.1"/>
+</g>
+<!-- 61 -->
+<g id="node53" class="node">
+<title>61</title>
+<polygon fill="none" stroke="black" points="1270.5,-108 1184.5,-108 1184.5,-72 1270.5,-72 1270.5,-108"/>
+<text text-anchor="middle" x="1227.5" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 60&#45;&gt;61 -->
+<g id="edge54" class="edge">
+<title>60&#45;&gt;61</title>
+<path fill="none" stroke="black" d="M1227.5,-143.7C1227.5,-135.98 1227.5,-126.71 1227.5,-118.11"/>
+<polygon fill="black" stroke="black" points="1231,-118.1 1227.5,-108.1 1224,-118.1 1231,-118.1"/>
+</g>
+<!-- 62 -->
+<g id="node54" class="node">
+<title>62</title>
+<polygon fill="none" stroke="black" points="1267.5,-36 1187.5,-36 1187.5,0 1267.5,0 1267.5,-36"/>
+<text text-anchor="middle" x="1227.5" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 61&#45;&gt;62 -->
+<g id="edge55" class="edge">
+<title>61&#45;&gt;62</title>
+<path fill="none" stroke="black" d="M1227.5,-71.7C1227.5,-63.98 1227.5,-54.71 1227.5,-46.11"/>
+<polygon fill="black" stroke="black" points="1231,-46.1 1227.5,-36.1 1224,-46.1 1231,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert-tvm_72_0.svg b/images/bert-pytorch/bert-tvm_72_0.svg
new file mode 100644
index 0000000..f8b6dca
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_72_0.svg
@@ -0,0 +1,559 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="1718pt" height="1772pt"
+ viewBox="0.00 0.00 1717.50 1772.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1768)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1768 1713.5,-1768 1713.5,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="814.5" cy="-1746" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="814.5" y="-1742.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 10 -->
+<g id="node3" class="node">
+<title>10</title>
+<polygon fill="none" stroke="black" points="990.5,-1692 638.5,-1692 638.5,-1656 990.5,-1656 990.5,-1692"/>
+<text text-anchor="middle" x="814.5" y="-1670.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;10 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;10</title>
+<path fill="none" stroke="black" d="M814.5,-1727.7C814.5,-1719.98 814.5,-1710.71 814.5,-1702.11"/>
+<polygon fill="black" stroke="black" points="818,-1702.1 814.5,-1692.1 811,-1702.1 818,-1702.1"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="1084.5" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1084.5" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 33 -->
+<g id="node25" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="1120,-828 1049,-828 1049,-792 1120,-792 1120,-828"/>
+<text text-anchor="middle" x="1084.5" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 1&#45;&gt;33 -->
+<g id="edge25" class="edge">
+<title>1&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M1084.5,-863.7C1084.5,-855.98 1084.5,-846.71 1084.5,-838.11"/>
+<polygon fill="black" stroke="black" points="1088,-838.1 1084.5,-828.1 1081,-838.1 1088,-838.1"/>
+</g>
+<!-- 12 -->
+<g id="node5" class="node">
+<title>12</title>
+<polygon fill="none" stroke="black" points="834,-1620 665,-1620 665,-1584 834,-1584 834,-1620"/>
+<text text-anchor="middle" x="749.5" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 10&#45;&gt;12 -->
+<g id="edge2" class="edge">
+<title>10&#45;&gt;12</title>
+<path fill="none" stroke="black" d="M798.43,-1655.7C790.56,-1647.22 780.94,-1636.86 772.33,-1627.58"/>
+<polygon fill="black" stroke="black" points="774.75,-1625.05 765.38,-1620.1 769.62,-1629.81 774.75,-1625.05"/>
+</g>
+<!-- 20 -->
+<g id="node13" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="324,-1620 155,-1620 155,-1584 324,-1584 324,-1620"/>
+<text text-anchor="middle" x="239.5" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 10&#45;&gt;20 -->
+<g id="edge10" class="edge">
+<title>10&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M674.57,-1655.97C569,-1643.11 426.99,-1625.83 334.38,-1614.55"/>
+<polygon fill="black" stroke="black" points="334.53,-1611.04 324.18,-1613.31 333.68,-1617.99 334.53,-1611.04"/>
+</g>
+<!-- 39 -->
+<g id="node31" class="node">
+<title>39</title>
+<polygon fill="none" stroke="black" points="1344,-1620 1175,-1620 1175,-1584 1344,-1584 1344,-1620"/>
+<text text-anchor="middle" x="1259.5" y="-1598.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 10&#45;&gt;39 -->
+<g id="edge30" class="edge">
+<title>10&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M922.79,-1655.97C996.72,-1644.34 1093.73,-1629.08 1164.62,-1617.92"/>
+<polygon fill="black" stroke="black" points="1165.55,-1621.32 1174.89,-1616.31 1164.47,-1614.41 1165.55,-1621.32"/>
+</g>
+<!-- 11 -->
+<g id="node4" class="node">
+<title>11</title>
+<polygon fill="none" stroke="black" points="620.5,-1692 378.5,-1692 378.5,-1656 620.5,-1656 620.5,-1692"/>
+<text text-anchor="middle" x="499.5" y="-1670.3" font-family="Times,serif" font-size="14.00">Constant((1, 768, 768), float32)</text>
+</g>
+<!-- 11&#45;&gt;12 -->
+<g id="edge3" class="edge">
+<title>11&#45;&gt;12</title>
+<path fill="none" stroke="black" d="M560.34,-1655.97C596.03,-1645.97 641.29,-1633.3 678.59,-1622.85"/>
+<polygon fill="black" stroke="black" points="679.7,-1626.18 688.39,-1620.11 677.82,-1619.44 679.7,-1626.18"/>
+</g>
+<!-- 13 -->
+<g id="node6" class="node">
+<title>13</title>
+<polygon fill="none" stroke="black" points="923,-1548 576,-1548 576,-1512 923,-1512 923,-1548"/>
+<text text-anchor="middle" x="749.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 12&#45;&gt;13 -->
+<g id="edge4" class="edge">
+<title>12&#45;&gt;13</title>
+<path fill="none" stroke="black" d="M749.5,-1583.7C749.5,-1575.98 749.5,-1566.71 749.5,-1558.11"/>
+<polygon fill="black" stroke="black" points="753,-1558.1 749.5,-1548.1 746,-1558.1 753,-1558.1"/>
+</g>
+<!-- 15 -->
+<g id="node8" class="node">
+<title>15</title>
+<polygon fill="none" stroke="black" points="869,-1476 798,-1476 798,-1440 869,-1440 869,-1476"/>
+<text text-anchor="middle" x="833.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 13&#45;&gt;15 -->
+<g id="edge5" class="edge">
+<title>13&#45;&gt;15</title>
+<path fill="none" stroke="black" d="M770.26,-1511.7C780.74,-1502.97 793.61,-1492.24 805,-1482.75"/>
+<polygon fill="black" stroke="black" points="807.53,-1485.19 812.97,-1476.1 803.05,-1479.82 807.53,-1485.19"/>
+</g>
+<!-- 14 -->
+<g id="node7" class="node">
+<title>14</title>
+<polygon fill="none" stroke="black" points="1133.5,-1548 941.5,-1548 941.5,-1512 1133.5,-1512 1133.5,-1548"/>
+<text text-anchor="middle" x="1037.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 14&#45;&gt;15 -->
+<g id="edge6" class="edge">
+<title>14&#45;&gt;15</title>
+<path fill="none" stroke="black" d="M987.86,-1511.97C954.62,-1500.56 911.2,-1485.66 878.88,-1474.57"/>
+<polygon fill="black" stroke="black" points="879.83,-1471.2 869.23,-1471.26 877.55,-1477.82 879.83,-1471.2"/>
+</g>
+<!-- 16 -->
+<g id="node9" class="node">
+<title>16</title>
+<polygon fill="none" stroke="black" points="1016,-1404 651,-1404 651,-1368 1016,-1368 1016,-1404"/>
+<text text-anchor="middle" x="833.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 15&#45;&gt;16 -->
+<g id="edge7" class="edge">
+<title>15&#45;&gt;16</title>
+<path fill="none" stroke="black" d="M833.5,-1439.7C833.5,-1431.98 833.5,-1422.71 833.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="837,-1414.1 833.5,-1404.1 830,-1414.1 837,-1414.1"/>
+</g>
+<!-- 17 -->
+<g id="node10" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="921,-1332 692,-1332 692,-1296 921,-1296 921,-1332"/>
+<text text-anchor="middle" x="806.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 16&#45;&gt;17 -->
+<g id="edge8" class="edge">
+<title>16&#45;&gt;17</title>
+<path fill="none" stroke="black" d="M826.83,-1367.7C823.79,-1359.81 820.12,-1350.3 816.74,-1341.55"/>
+<polygon fill="black" stroke="black" points="819.96,-1340.17 813.1,-1332.1 813.43,-1342.69 819.96,-1340.17"/>
+</g>
+<!-- 18 -->
+<g id="node11" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="973,-1116 630,-1116 630,-1080 973,-1080 973,-1116"/>
+<text text-anchor="middle" x="801.5" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge9" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M806.1,-1295.85C805.23,-1258.83 803.19,-1171.18 802.14,-1126.39"/>
+<polygon fill="black" stroke="black" points="805.64,-1126.15 801.9,-1116.23 798.64,-1126.31 805.64,-1126.15"/>
+</g>
+<!-- 29 -->
+<g id="node22" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="880,-1044 711,-1044 711,-1008 880,-1008 880,-1044"/>
+<text text-anchor="middle" x="795.5" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 18&#45;&gt;29 -->
+<g id="edge20" class="edge">
+<title>18&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M800.02,-1079.7C799.36,-1071.98 798.56,-1062.71 797.82,-1054.11"/>
+<polygon fill="black" stroke="black" points="801.31,-1053.77 796.97,-1044.1 794.33,-1054.37 801.31,-1053.77"/>
+</g>
+<!-- 19 -->
+<g id="node12" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="360.5,-1692 118.5,-1692 118.5,-1656 360.5,-1656 360.5,-1692"/>
+<text text-anchor="middle" x="239.5" y="-1670.3" font-family="Times,serif" font-size="14.00">Constant((1, 768, 768), float32)</text>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge11" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M239.5,-1655.7C239.5,-1647.98 239.5,-1638.71 239.5,-1630.11"/>
+<polygon fill="black" stroke="black" points="243,-1630.1 239.5,-1620.1 236,-1630.1 243,-1630.1"/>
+</g>
+<!-- 21 -->
+<g id="node14" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="347,-1548 0,-1548 0,-1512 347,-1512 347,-1548"/>
+<text text-anchor="middle" x="173.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge12" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M223.19,-1583.7C215.19,-1575.22 205.43,-1564.86 196.68,-1555.58"/>
+<polygon fill="black" stroke="black" points="199.03,-1552.98 189.63,-1548.1 193.94,-1557.78 199.03,-1552.98"/>
+</g>
+<!-- 23 -->
+<g id="node16" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="486,-1476 415,-1476 415,-1440 486,-1440 486,-1476"/>
+<text text-anchor="middle" x="450.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 21&#45;&gt;23 -->
+<g id="edge13" class="edge">
+<title>21&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M240.91,-1511.97C291.72,-1499.13 360.04,-1481.86 404.66,-1470.58"/>
+<polygon fill="black" stroke="black" points="405.8,-1473.91 414.64,-1468.06 404.09,-1467.12 405.8,-1473.91"/>
+</g>
+<!-- 22 -->
+<g id="node15" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="557.5,-1548 365.5,-1548 365.5,-1512 557.5,-1512 557.5,-1548"/>
+<text text-anchor="middle" x="461.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge14" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M458.78,-1511.7C457.57,-1503.98 456.11,-1494.71 454.76,-1486.11"/>
+<polygon fill="black" stroke="black" points="458.2,-1485.44 453.19,-1476.1 451.28,-1486.53 458.2,-1485.44"/>
+</g>
+<!-- 24 -->
+<g id="node17" class="node">
+<title>24</title>
+<polygon fill="none" stroke="black" points="633,-1404 268,-1404 268,-1368 633,-1368 633,-1404"/>
+<text text-anchor="middle" x="450.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 23&#45;&gt;24 -->
+<g id="edge15" class="edge">
+<title>23&#45;&gt;24</title>
+<path fill="none" stroke="black" d="M450.5,-1439.7C450.5,-1431.98 450.5,-1422.71 450.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="454,-1414.1 450.5,-1404.1 447,-1414.1 454,-1414.1"/>
+</g>
+<!-- 25 -->
+<g id="node18" class="node">
+<title>25</title>
+<polygon fill="none" stroke="black" points="566,-1332 337,-1332 337,-1296 566,-1296 566,-1332"/>
+<text text-anchor="middle" x="451.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 24&#45;&gt;25 -->
+<g id="edge16" class="edge">
+<title>24&#45;&gt;25</title>
+<path fill="none" stroke="black" d="M450.75,-1367.7C450.86,-1359.98 450.99,-1350.71 451.11,-1342.11"/>
+<polygon fill="black" stroke="black" points="454.61,-1342.15 451.26,-1332.1 447.61,-1342.05 454.61,-1342.15"/>
+</g>
+<!-- 26 -->
+<g id="node19" class="node">
+<title>26</title>
+<polygon fill="none" stroke="black" points="596,-1260 367,-1260 367,-1224 596,-1224 596,-1260"/>
+<text text-anchor="middle" x="481.5" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 25&#45;&gt;26 -->
+<g id="edge17" class="edge">
+<title>25&#45;&gt;26</title>
+<path fill="none" stroke="black" d="M458.92,-1295.7C462.29,-1287.81 466.37,-1278.3 470.12,-1269.55"/>
+<polygon fill="black" stroke="black" points="473.45,-1270.67 474.17,-1260.1 467.01,-1267.92 473.45,-1270.67"/>
+</g>
+<!-- 27 -->
+<g id="node20" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="668,-1188 325,-1188 325,-1152 668,-1152 668,-1188"/>
+<text text-anchor="middle" x="496.5" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 26&#45;&gt;27 -->
+<g id="edge18" class="edge">
+<title>26&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M485.21,-1223.7C486.86,-1215.98 488.85,-1206.71 490.69,-1198.11"/>
+<polygon fill="black" stroke="black" points="494.16,-1198.62 492.83,-1188.1 487.32,-1197.15 494.16,-1198.62"/>
+</g>
+<!-- 28 -->
+<g id="node21" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="610,-1116 399,-1116 399,-1080 610,-1080 610,-1116"/>
+<text text-anchor="middle" x="504.5" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 27&#45;&gt;28 -->
+<g id="edge19" class="edge">
+<title>27&#45;&gt;28</title>
+<path fill="none" stroke="black" d="M498.48,-1151.7C499.36,-1143.98 500.42,-1134.71 501.4,-1126.11"/>
+<polygon fill="black" stroke="black" points="504.89,-1126.44 502.55,-1116.1 497.93,-1125.64 504.89,-1126.44"/>
+</g>
+<!-- 28&#45;&gt;29 -->
+<g id="edge21" class="edge">
+<title>28&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M575.31,-1079.97C617.47,-1069.83 671.09,-1056.93 714.87,-1046.4"/>
+<polygon fill="black" stroke="black" points="715.72,-1049.79 724.62,-1044.05 714.08,-1042.99 715.72,-1049.79"/>
+</g>
+<!-- 30 -->
+<g id="node23" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="978,-972 613,-972 613,-936 978,-936 978,-972"/>
+<text text-anchor="middle" x="795.5" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge22" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M795.5,-1007.7C795.5,-999.98 795.5,-990.71 795.5,-982.11"/>
+<polygon fill="black" stroke="black" points="799,-982.1 795.5,-972.1 792,-982.1 799,-982.1"/>
+</g>
+<!-- 32 -->
+<g id="node24" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="849,-900 742,-900 742,-864 849,-864 849,-900"/>
+<text text-anchor="middle" x="795.5" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 30&#45;&gt;32 -->
+<g id="edge23" class="edge">
+<title>30&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M795.5,-935.7C795.5,-927.98 795.5,-918.71 795.5,-910.11"/>
+<polygon fill="black" stroke="black" points="799,-910.1 795.5,-900.1 792,-910.1 799,-910.1"/>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge24" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M849.24,-866.13C852.03,-865.4 854.8,-864.69 857.5,-864 920.04,-848.13 992.73,-831.47 1038.67,-821.16"/>
+<polygon fill="black" stroke="black" points="1039.63,-824.53 1048.62,-818.93 1038.1,-817.7 1039.63,-824.53"/>
+</g>
+<!-- 34 -->
+<g id="node26" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="1174.5,-756 1000.5,-756 1000.5,-720 1174.5,-720 1174.5,-756"/>
+<text text-anchor="middle" x="1087.5" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge26" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M1085.24,-791.7C1085.57,-783.98 1085.97,-774.71 1086.34,-766.11"/>
+<polygon fill="black" stroke="black" points="1089.84,-766.25 1086.77,-756.1 1082.84,-765.95 1089.84,-766.25"/>
+</g>
+<!-- 35 -->
+<g id="node27" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="1202,-684 1019,-684 1019,-648 1202,-648 1202,-684"/>
+<text text-anchor="middle" x="1110.5" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge27" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M1093.19,-719.7C1095.75,-711.9 1098.83,-702.51 1101.68,-693.83"/>
+<polygon fill="black" stroke="black" points="1105.08,-694.7 1104.88,-684.1 1098.43,-692.51 1105.08,-694.7"/>
+</g>
+<!-- 36 -->
+<g id="node28" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="1206.5,-612 1038.5,-612 1038.5,-576 1206.5,-576 1206.5,-612"/>
+<text text-anchor="middle" x="1122.5" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge28" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M1113.47,-647.7C1114.79,-639.98 1116.38,-630.71 1117.85,-622.11"/>
+<polygon fill="black" stroke="black" points="1121.33,-622.55 1119.57,-612.1 1114.43,-621.37 1121.33,-622.55"/>
+</g>
+<!-- 37 -->
+<g id="node29" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1300,-540 957,-540 957,-504 1300,-504 1300,-540"/>
+<text text-anchor="middle" x="1128.5" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 36&#45;&gt;37 -->
+<g id="edge29" class="edge">
+<title>36&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M1123.98,-575.7C1124.64,-567.98 1125.44,-558.71 1126.18,-550.11"/>
+<polygon fill="black" stroke="black" points="1129.67,-550.37 1127.03,-540.1 1122.69,-549.77 1129.67,-550.37"/>
+</g>
+<!-- 47 -->
+<g id="node39" class="node">
+<title>47</title>
+<polygon fill="none" stroke="black" points="1312,-468 1143,-468 1143,-432 1312,-432 1312,-468"/>
+<text text-anchor="middle" x="1227.5" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 37&#45;&gt;47 -->
+<g id="edge39" class="edge">
+<title>37&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1152.72,-503.88C1165.42,-494.89 1181.16,-483.76 1194.92,-474.03"/>
+<polygon fill="black" stroke="black" points="1197.04,-476.82 1203.19,-468.19 1193,-471.11 1197.04,-476.82"/>
+</g>
+<!-- 38 -->
+<g id="node30" class="node">
+<title>38</title>
+<polygon fill="none" stroke="black" points="1380.5,-1692 1138.5,-1692 1138.5,-1656 1380.5,-1656 1380.5,-1692"/>
+<text text-anchor="middle" x="1259.5" y="-1670.3" font-family="Times,serif" font-size="14.00">Constant((1, 768, 768), float32)</text>
+</g>
+<!-- 38&#45;&gt;39 -->
+<g id="edge31" class="edge">
+<title>38&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M1259.5,-1655.7C1259.5,-1647.98 1259.5,-1638.71 1259.5,-1630.11"/>
+<polygon fill="black" stroke="black" points="1263,-1630.1 1259.5,-1620.1 1256,-1630.1 1263,-1630.1"/>
+</g>
+<!-- 40 -->
+<g id="node32" class="node">
+<title>40</title>
+<polygon fill="none" stroke="black" points="1499,-1548 1152,-1548 1152,-1512 1499,-1512 1499,-1548"/>
+<text text-anchor="middle" x="1325.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 39&#45;&gt;40 -->
+<g id="edge32" class="edge">
+<title>39&#45;&gt;40</title>
+<path fill="none" stroke="black" d="M1275.81,-1583.7C1283.81,-1575.22 1293.57,-1564.86 1302.32,-1555.58"/>
+<polygon fill="black" stroke="black" points="1305.06,-1557.78 1309.37,-1548.1 1299.97,-1552.98 1305.06,-1557.78"/>
+</g>
+<!-- 42 -->
+<g id="node34" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1366,-1476 1295,-1476 1295,-1440 1366,-1440 1366,-1476"/>
+<text text-anchor="middle" x="1330.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 40&#45;&gt;42 -->
+<g id="edge33" class="edge">
+<title>40&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1326.74,-1511.7C1327.29,-1503.98 1327.95,-1494.71 1328.56,-1486.11"/>
+<polygon fill="black" stroke="black" points="1332.06,-1486.33 1329.28,-1476.1 1325.07,-1485.83 1332.06,-1486.33"/>
+</g>
+<!-- 41 -->
+<g id="node33" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1709.5,-1548 1517.5,-1548 1517.5,-1512 1709.5,-1512 1709.5,-1548"/>
+<text text-anchor="middle" x="1613.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge34" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1544.63,-1511.97C1492.31,-1499.02 1421.8,-1481.58 1376.25,-1470.32"/>
+<polygon fill="black" stroke="black" points="1376.9,-1466.87 1366.36,-1467.87 1375.22,-1473.67 1376.9,-1466.87"/>
+</g>
+<!-- 43 -->
+<g id="node35" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1513,-1404 1148,-1404 1148,-1368 1513,-1368 1513,-1404"/>
+<text text-anchor="middle" x="1330.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge35" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1330.5,-1439.7C1330.5,-1431.98 1330.5,-1422.71 1330.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="1334,-1414.1 1330.5,-1404.1 1327,-1414.1 1334,-1414.1"/>
+</g>
+<!-- 44 -->
+<g id="node36" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1445,-1332 1216,-1332 1216,-1296 1445,-1296 1445,-1332"/>
+<text text-anchor="middle" x="1330.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 43&#45;&gt;44 -->
+<g id="edge36" class="edge">
+<title>43&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M1330.5,-1367.7C1330.5,-1359.98 1330.5,-1350.71 1330.5,-1342.11"/>
+<polygon fill="black" stroke="black" points="1334,-1342.1 1330.5,-1332.1 1327,-1342.1 1334,-1342.1"/>
+</g>
+<!-- 45 -->
+<g id="node37" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="1502,-1260 1159,-1260 1159,-1224 1502,-1224 1502,-1260"/>
+<text text-anchor="middle" x="1330.5" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 44&#45;&gt;45 -->
+<g id="edge37" class="edge">
+<title>44&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M1330.5,-1295.7C1330.5,-1287.98 1330.5,-1278.71 1330.5,-1270.11"/>
+<polygon fill="black" stroke="black" points="1334,-1270.1 1330.5,-1260.1 1327,-1270.1 1334,-1270.1"/>
+</g>
+<!-- 46 -->
+<g id="node38" class="node">
+<title>46</title>
+<polygon fill="none" stroke="black" points="1436,-1044 1225,-1044 1225,-1008 1436,-1008 1436,-1044"/>
+<text text-anchor="middle" x="1330.5" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 45&#45;&gt;46 -->
+<g id="edge38" class="edge">
+<title>45&#45;&gt;46</title>
+<path fill="none" stroke="black" d="M1330.5,-1223.85C1330.5,-1186.83 1330.5,-1099.18 1330.5,-1054.39"/>
+<polygon fill="black" stroke="black" points="1334,-1054.23 1330.5,-1044.23 1327,-1054.23 1334,-1054.23"/>
+</g>
+<!-- 46&#45;&gt;47 -->
+<g id="edge40" class="edge">
+<title>46&#45;&gt;47</title>
+<path fill="none" stroke="black" d="M1330.5,-1007.95C1330.5,-981.29 1330.5,-928.11 1330.5,-883 1330.5,-883 1330.5,-883 1330.5,-593 1330.5,-552.36 1332.93,-537.21 1309.5,-504 1300.77,-491.62 1288.14,-481.42 1275.4,-473.4"/>
+<polygon fill="black" stroke="black" points="1276.99,-470.27 1266.6,-468.19 1273.42,-476.3 1276.99,-470.27"/>
+</g>
+<!-- 48 -->
+<g id="node40" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="1410,-396 1045,-396 1045,-360 1410,-360 1410,-396"/>
+<text text-anchor="middle" x="1227.5" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 47&#45;&gt;48 -->
+<g id="edge41" class="edge">
+<title>47&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1227.5,-431.7C1227.5,-423.98 1227.5,-414.71 1227.5,-406.11"/>
+<polygon fill="black" stroke="black" points="1231,-406.1 1227.5,-396.1 1224,-406.1 1231,-406.1"/>
+</g>
+<!-- 49 -->
+<g id="node41" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="1342,-324 1113,-324 1113,-288 1342,-288 1342,-324"/>
+<text text-anchor="middle" x="1227.5" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge42" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M1227.5,-359.7C1227.5,-351.98 1227.5,-342.71 1227.5,-334.11"/>
+<polygon fill="black" stroke="black" points="1231,-334.1 1227.5,-324.1 1224,-334.1 1231,-334.1"/>
+</g>
+<!-- 50 -->
+<g id="node42" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="1260,-252 1195,-252 1195,-216 1260,-216 1260,-252"/>
+<text text-anchor="middle" x="1227.5" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 49&#45;&gt;50 -->
+<g id="edge43" class="edge">
+<title>49&#45;&gt;50</title>
+<path fill="none" stroke="black" d="M1227.5,-287.7C1227.5,-279.98 1227.5,-270.71 1227.5,-262.11"/>
+<polygon fill="black" stroke="black" points="1231,-262.1 1227.5,-252.1 1224,-262.1 1231,-262.1"/>
+</g>
+<!-- 51 -->
+<g id="node43" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="1401,-180 1054,-180 1054,-144 1401,-144 1401,-180"/>
+<text text-anchor="middle" x="1227.5" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge44" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M1227.5,-215.7C1227.5,-207.98 1227.5,-198.71 1227.5,-190.11"/>
+<polygon fill="black" stroke="black" points="1231,-190.1 1227.5,-180.1 1224,-190.1 1231,-190.1"/>
+</g>
+<!-- 52 -->
+<g id="node44" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="1270.5,-108 1184.5,-108 1184.5,-72 1270.5,-72 1270.5,-108"/>
+<text text-anchor="middle" x="1227.5" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge45" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M1227.5,-143.7C1227.5,-135.98 1227.5,-126.71 1227.5,-118.11"/>
+<polygon fill="black" stroke="black" points="1231,-118.1 1227.5,-108.1 1224,-118.1 1231,-118.1"/>
+</g>
+<!-- 53 -->
+<g id="node45" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="1267.5,-36 1187.5,-36 1187.5,0 1267.5,0 1267.5,-36"/>
+<text text-anchor="middle" x="1227.5" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge46" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1227.5,-71.7C1227.5,-63.98 1227.5,-54.71 1227.5,-46.11"/>
+<polygon fill="black" stroke="black" points="1231,-46.1 1227.5,-36.1 1224,-46.1 1231,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert-tvm_74_0.svg b/images/bert-pytorch/bert-tvm_74_0.svg
new file mode 100644
index 0000000..f7a2ace
--- /dev/null
+++ b/images/bert-pytorch/bert-tvm_74_0.svg
@@ -0,0 +1,547 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="2649pt" height="1844pt"
+ viewBox="0.00 0.00 2648.50 1844.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1840)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1840 2644.5,-1840 2644.5,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="1120.5" cy="-1818" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="1120.5" y="-1814.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 11 -->
+<g id="node3" class="node">
+<title>11</title>
+<polygon fill="none" stroke="black" points="1296.5,-1764 944.5,-1764 944.5,-1728 1296.5,-1728 1296.5,-1764"/>
+<text text-anchor="middle" x="1120.5" y="-1742.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;11 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;11</title>
+<path fill="none" stroke="black" d="M1120.5,-1799.7C1120.5,-1791.98 1120.5,-1782.71 1120.5,-1774.11"/>
+<polygon fill="black" stroke="black" points="1124,-1774.1 1120.5,-1764.1 1117,-1774.1 1124,-1774.1"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="1640.5" cy="-882" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1640.5" y="-878.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 40 -->
+<g id="node25" class="node">
+<title>40</title>
+<polygon fill="none" stroke="black" points="1676,-828 1605,-828 1605,-792 1676,-792 1676,-828"/>
+<text text-anchor="middle" x="1640.5" y="-806.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 1&#45;&gt;40 -->
+<g id="edge25" class="edge">
+<title>1&#45;&gt;40</title>
+<path fill="none" stroke="black" d="M1640.5,-863.7C1640.5,-855.98 1640.5,-846.71 1640.5,-838.11"/>
+<polygon fill="black" stroke="black" points="1644,-838.1 1640.5,-828.1 1637,-838.1 1644,-838.1"/>
+</g>
+<!-- 13 -->
+<g id="node5" class="node">
+<title>13</title>
+<polygon fill="none" stroke="black" points="1364,-1692 1195,-1692 1195,-1656 1364,-1656 1364,-1692"/>
+<text text-anchor="middle" x="1279.5" y="-1670.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 11&#45;&gt;13 -->
+<g id="edge2" class="edge">
+<title>11&#45;&gt;13</title>
+<path fill="none" stroke="black" d="M1159.4,-1727.88C1181.13,-1718.31 1208.4,-1706.3 1231.46,-1696.15"/>
+<polygon fill="black" stroke="black" points="1232.99,-1699.3 1240.73,-1692.07 1230.17,-1692.89 1232.99,-1699.3"/>
+</g>
+<!-- 12 -->
+<g id="node4" class="node">
+<title>12</title>
+<polygon fill="none" stroke="black" points="1565,-1764 1314,-1764 1314,-1728 1565,-1728 1565,-1764"/>
+<text text-anchor="middle" x="1439.5" y="-1742.3" font-family="Times,serif" font-size="14.00">Constant((1, 2304, 768), float32)</text>
+</g>
+<!-- 12&#45;&gt;13 -->
+<g id="edge3" class="edge">
+<title>12&#45;&gt;13</title>
+<path fill="none" stroke="black" d="M1400.36,-1727.88C1378.49,-1718.31 1351.04,-1706.3 1327.84,-1696.15"/>
+<polygon fill="black" stroke="black" points="1329.08,-1692.87 1318.51,-1692.07 1326.27,-1699.28 1329.08,-1692.87"/>
+</g>
+<!-- 17 -->
+<g id="node6" class="node">
+<title>17</title>
+<polygon fill="none" stroke="black" points="823,-1620 0,-1620 0,-1584 823,-1584 823,-1620"/>
+<text text-anchor="middle" x="411.5" y="-1598.3" font-family="Times,serif" font-size="14.00">strided_slice(·, [0 0 0], [ &#45;1 &#160;&#45;1 768], [1 1 1]| begin=[0, 0, 0], end=[&#45;1, &#45;1, 768], strides=[1, 1, 1], slice_mode=size)</text>
+</g>
+<!-- 13&#45;&gt;17 -->
+<g id="edge4" class="edge">
+<title>13&#45;&gt;17</title>
+<path fill="none" stroke="black" d="M1194.69,-1666.16C1065.57,-1655.75 815.4,-1635.57 632.43,-1620.82"/>
+<polygon fill="black" stroke="black" points="632.65,-1617.32 622.4,-1620.01 632.09,-1624.3 632.65,-1617.32"/>
+</g>
+<!-- 27 -->
+<g id="node13" class="node">
+<title>27</title>
+<polygon fill="none" stroke="black" points="1718,-1620 841,-1620 841,-1584 1718,-1584 1718,-1620"/>
+<text text-anchor="middle" x="1279.5" y="-1598.3" font-family="Times,serif" font-size="14.00">strided_slice(·, [ &#160;0 &#160;&#160;0 768], [ &#45;1 &#160;&#45;1 768], [1 1 1]| begin=[0, 0, 768], end=[&#45;1, &#45;1, 768], strides=[1, 1, 1], slice_mode=size)</text>
+</g>
+<!-- 13&#45;&gt;27 -->
+<g id="edge11" class="edge">
+<title>13&#45;&gt;27</title>
+<path fill="none" stroke="black" d="M1279.5,-1655.7C1279.5,-1647.98 1279.5,-1638.71 1279.5,-1630.11"/>
+<polygon fill="black" stroke="black" points="1283,-1630.1 1279.5,-1620.1 1276,-1630.1 1283,-1630.1"/>
+</g>
+<!-- 48 -->
+<g id="node30" class="node">
+<title>48</title>
+<polygon fill="none" stroke="black" points="2640.5,-1620 1736.5,-1620 1736.5,-1584 2640.5,-1584 2640.5,-1620"/>
+<text text-anchor="middle" x="2188.5" y="-1598.3" font-family="Times,serif" font-size="14.00">strided_slice(·, [ &#160;&#160;0 &#160;&#160;&#160;0 1536], [ &#45;1 &#160;&#45;1 768], [1 1 1]| begin=[0, 0, 1536], end=[&#45;1, &#45;1, 768], strides=[1, 1, 1], slice_mode=size)</text>
+</g>
+<!-- 13&#45;&gt;48 -->
+<g id="edge30" class="edge">
+<title>13&#45;&gt;48</title>
+<path fill="none" stroke="black" d="M1364.2,-1666.48C1498.28,-1656.15 1764.03,-1635.69 1957.65,-1620.78"/>
+<polygon fill="black" stroke="black" points="1957.98,-1624.26 1967.68,-1620 1957.44,-1617.28 1957.98,-1624.26"/>
+</g>
+<!-- 18 -->
+<g id="node7" class="node">
+<title>18</title>
+<polygon fill="none" stroke="black" points="631,-1548 284,-1548 284,-1512 631,-1512 631,-1548"/>
+<text text-anchor="middle" x="457.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 17&#45;&gt;18 -->
+<g id="edge5" class="edge">
+<title>17&#45;&gt;18</title>
+<path fill="none" stroke="black" d="M422.87,-1583.7C428.22,-1575.56 434.7,-1565.69 440.61,-1556.7"/>
+<polygon fill="black" stroke="black" points="443.69,-1558.38 446.26,-1548.1 437.84,-1554.54 443.69,-1558.38"/>
+</g>
+<!-- 20 -->
+<g id="node9" class="node">
+<title>20</title>
+<polygon fill="none" stroke="black" points="806,-1476 735,-1476 735,-1440 806,-1440 806,-1476"/>
+<text text-anchor="middle" x="770.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 18&#45;&gt;20 -->
+<g id="edge6" class="edge">
+<title>18&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M533.67,-1511.97C593.43,-1498.6 674.63,-1480.44 724.75,-1469.23"/>
+<polygon fill="black" stroke="black" points="725.65,-1472.62 734.65,-1467.02 724.12,-1465.79 725.65,-1472.62"/>
+</g>
+<!-- 19 -->
+<g id="node8" class="node">
+<title>19</title>
+<polygon fill="none" stroke="black" points="866.5,-1548 674.5,-1548 674.5,-1512 866.5,-1512 866.5,-1548"/>
+<text text-anchor="middle" x="770.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 19&#45;&gt;20 -->
+<g id="edge7" class="edge">
+<title>19&#45;&gt;20</title>
+<path fill="none" stroke="black" d="M770.5,-1511.7C770.5,-1503.98 770.5,-1494.71 770.5,-1486.11"/>
+<polygon fill="black" stroke="black" points="774,-1486.1 770.5,-1476.1 767,-1486.1 774,-1486.1"/>
+</g>
+<!-- 21 -->
+<g id="node10" class="node">
+<title>21</title>
+<polygon fill="none" stroke="black" points="1051,-1404 686,-1404 686,-1368 1051,-1368 1051,-1404"/>
+<text text-anchor="middle" x="868.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 20&#45;&gt;21 -->
+<g id="edge8" class="edge">
+<title>20&#45;&gt;21</title>
+<path fill="none" stroke="black" d="M794.47,-1439.88C807.05,-1430.89 822.63,-1419.76 836.25,-1410.03"/>
+<polygon fill="black" stroke="black" points="838.33,-1412.85 844.43,-1404.19 834.26,-1407.15 838.33,-1412.85"/>
+</g>
+<!-- 22 -->
+<g id="node11" class="node">
+<title>22</title>
+<polygon fill="none" stroke="black" points="1043,-1332 814,-1332 814,-1296 1043,-1296 1043,-1332"/>
+<text text-anchor="middle" x="928.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 21&#45;&gt;22 -->
+<g id="edge9" class="edge">
+<title>21&#45;&gt;22</title>
+<path fill="none" stroke="black" d="M883.33,-1367.7C890.52,-1359.3 899.3,-1349.07 907.19,-1339.86"/>
+<polygon fill="black" stroke="black" points="909.99,-1341.97 913.84,-1332.1 904.67,-1337.42 909.99,-1341.97"/>
+</g>
+<!-- 23 -->
+<g id="node12" class="node">
+<title>23</title>
+<polygon fill="none" stroke="black" points="1166,-1116 823,-1116 823,-1080 1166,-1080 1166,-1116"/>
+<text text-anchor="middle" x="994.5" y="-1094.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 22&#45;&gt;23 -->
+<g id="edge10" class="edge">
+<title>22&#45;&gt;23</title>
+<path fill="none" stroke="black" d="M933.79,-1295.85C945.25,-1258.68 972.47,-1170.44 986.23,-1125.82"/>
+<polygon fill="black" stroke="black" points="989.58,-1126.82 989.19,-1116.23 982.89,-1124.76 989.58,-1126.82"/>
+</g>
+<!-- 36 -->
+<g id="node22" class="node">
+<title>36</title>
+<polygon fill="none" stroke="black" points="1385,-1044 1216,-1044 1216,-1008 1385,-1008 1385,-1044"/>
+<text text-anchor="middle" x="1300.5" y="-1022.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 23&#45;&gt;36 -->
+<g id="edge20" class="edge">
+<title>23&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M1068.96,-1079.97C1113.38,-1069.8 1169.92,-1056.87 1216,-1046.33"/>
+<polygon fill="black" stroke="black" points="1217,-1049.69 1225.97,-1044.05 1215.44,-1042.87 1217,-1049.69"/>
+</g>
+<!-- 28 -->
+<g id="node14" class="node">
+<title>28</title>
+<polygon fill="none" stroke="black" points="1453,-1548 1106,-1548 1106,-1512 1453,-1512 1453,-1548"/>
+<text text-anchor="middle" x="1279.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 27&#45;&gt;28 -->
+<g id="edge12" class="edge">
+<title>27&#45;&gt;28</title>
+<path fill="none" stroke="black" d="M1279.5,-1583.7C1279.5,-1575.98 1279.5,-1566.71 1279.5,-1558.11"/>
+<polygon fill="black" stroke="black" points="1283,-1558.1 1279.5,-1548.1 1276,-1558.1 1283,-1558.1"/>
+</g>
+<!-- 30 -->
+<g id="node16" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="1336,-1476 1265,-1476 1265,-1440 1336,-1440 1336,-1476"/>
+<text text-anchor="middle" x="1300.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 28&#45;&gt;30 -->
+<g id="edge13" class="edge">
+<title>28&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M1284.69,-1511.7C1287.03,-1503.9 1289.85,-1494.51 1292.45,-1485.83"/>
+<polygon fill="black" stroke="black" points="1295.85,-1486.69 1295.37,-1476.1 1289.14,-1484.68 1295.85,-1486.69"/>
+</g>
+<!-- 29 -->
+<g id="node15" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="1663.5,-1548 1471.5,-1548 1471.5,-1512 1663.5,-1512 1663.5,-1548"/>
+<text text-anchor="middle" x="1567.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 29&#45;&gt;30 -->
+<g id="edge14" class="edge">
+<title>29&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M1502.53,-1511.97C1454.08,-1499.26 1389.12,-1482.23 1346.09,-1470.95"/>
+<polygon fill="black" stroke="black" points="1346.75,-1467.51 1336.19,-1468.36 1344.97,-1474.28 1346.75,-1467.51"/>
+</g>
+<!-- 31 -->
+<g id="node17" class="node">
+<title>31</title>
+<polygon fill="none" stroke="black" points="1483,-1404 1118,-1404 1118,-1368 1483,-1368 1483,-1404"/>
+<text text-anchor="middle" x="1300.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 30&#45;&gt;31 -->
+<g id="edge15" class="edge">
+<title>30&#45;&gt;31</title>
+<path fill="none" stroke="black" d="M1300.5,-1439.7C1300.5,-1431.98 1300.5,-1422.71 1300.5,-1414.11"/>
+<polygon fill="black" stroke="black" points="1304,-1414.1 1300.5,-1404.1 1297,-1414.1 1304,-1414.1"/>
+</g>
+<!-- 32 -->
+<g id="node18" class="node">
+<title>32</title>
+<polygon fill="none" stroke="black" points="1415,-1332 1186,-1332 1186,-1296 1415,-1296 1415,-1332"/>
+<text text-anchor="middle" x="1300.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 31&#45;&gt;32 -->
+<g id="edge16" class="edge">
+<title>31&#45;&gt;32</title>
+<path fill="none" stroke="black" d="M1300.5,-1367.7C1300.5,-1359.98 1300.5,-1350.71 1300.5,-1342.11"/>
+<polygon fill="black" stroke="black" points="1304,-1342.1 1300.5,-1332.1 1297,-1342.1 1304,-1342.1"/>
+</g>
+<!-- 33 -->
+<g id="node19" class="node">
+<title>33</title>
+<polygon fill="none" stroke="black" points="1415,-1260 1186,-1260 1186,-1224 1415,-1224 1415,-1260"/>
+<text text-anchor="middle" x="1300.5" y="-1238.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 1, 3, 2])</text>
+</g>
+<!-- 32&#45;&gt;33 -->
+<g id="edge17" class="edge">
+<title>32&#45;&gt;33</title>
+<path fill="none" stroke="black" d="M1300.5,-1295.7C1300.5,-1287.98 1300.5,-1278.71 1300.5,-1270.11"/>
+<polygon fill="black" stroke="black" points="1304,-1270.1 1300.5,-1260.1 1297,-1270.1 1304,-1270.1"/>
+</g>
+<!-- 34 -->
+<g id="node20" class="node">
+<title>34</title>
+<polygon fill="none" stroke="black" points="1472,-1188 1129,-1188 1129,-1152 1472,-1152 1472,-1188"/>
+<text text-anchor="middle" x="1300.5" y="-1166.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 64, 14], reverse=0)</text>
+</g>
+<!-- 33&#45;&gt;34 -->
+<g id="edge18" class="edge">
+<title>33&#45;&gt;34</title>
+<path fill="none" stroke="black" d="M1300.5,-1223.7C1300.5,-1215.98 1300.5,-1206.71 1300.5,-1198.11"/>
+<polygon fill="black" stroke="black" points="1304,-1198.1 1300.5,-1188.1 1297,-1198.1 1304,-1198.1"/>
+</g>
+<!-- 35 -->
+<g id="node21" class="node">
+<title>35</title>
+<polygon fill="none" stroke="black" points="1406,-1116 1195,-1116 1195,-1080 1406,-1080 1406,-1116"/>
+<text text-anchor="middle" x="1300.5" y="-1094.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 34&#45;&gt;35 -->
+<g id="edge19" class="edge">
+<title>34&#45;&gt;35</title>
+<path fill="none" stroke="black" d="M1300.5,-1151.7C1300.5,-1143.98 1300.5,-1134.71 1300.5,-1126.11"/>
+<polygon fill="black" stroke="black" points="1304,-1126.1 1300.5,-1116.1 1297,-1126.1 1304,-1126.1"/>
+</g>
+<!-- 35&#45;&gt;36 -->
+<g id="edge21" class="edge">
+<title>35&#45;&gt;36</title>
+<path fill="none" stroke="black" d="M1300.5,-1079.7C1300.5,-1071.98 1300.5,-1062.71 1300.5,-1054.11"/>
+<polygon fill="black" stroke="black" points="1304,-1054.1 1300.5,-1044.1 1297,-1054.1 1304,-1054.1"/>
+</g>
+<!-- 37 -->
+<g id="node23" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1504,-972 1139,-972 1139,-936 1504,-936 1504,-972"/>
+<text text-anchor="middle" x="1321.5" y="-950.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 14], reverse=0)</text>
+</g>
+<!-- 36&#45;&gt;37 -->
+<g id="edge22" class="edge">
+<title>36&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M1305.69,-1007.7C1308.03,-999.9 1310.85,-990.51 1313.45,-981.83"/>
+<polygon fill="black" stroke="black" points="1316.85,-982.69 1316.37,-972.1 1310.14,-980.68 1316.85,-982.69"/>
+</g>
+<!-- 39 -->
+<g id="node24" class="node">
+<title>39</title>
+<polygon fill="none" stroke="black" points="1395,-900 1288,-900 1288,-864 1395,-864 1395,-900"/>
+<text text-anchor="middle" x="1341.5" y="-878.3" font-family="Times,serif" font-size="14.00">divide(·, 8.0)</text>
+</g>
+<!-- 37&#45;&gt;39 -->
+<g id="edge23" class="edge">
+<title>37&#45;&gt;39</title>
+<path fill="none" stroke="black" d="M1326.44,-935.7C1328.67,-927.9 1331.35,-918.51 1333.83,-909.83"/>
+<polygon fill="black" stroke="black" points="1337.23,-910.68 1336.61,-900.1 1330.5,-908.76 1337.23,-910.68"/>
+</g>
+<!-- 39&#45;&gt;40 -->
+<g id="edge24" class="edge">
+<title>39&#45;&gt;40</title>
+<path fill="none" stroke="black" d="M1395.36,-868.27C1401.48,-866.83 1407.62,-865.38 1413.5,-864 1476.32,-849.25 1548.94,-832.31 1594.79,-821.64"/>
+<polygon fill="black" stroke="black" points="1595.77,-825 1604.72,-819.33 1594.18,-818.18 1595.77,-825"/>
+</g>
+<!-- 41 -->
+<g id="node26" class="node">
+<title>41</title>
+<polygon fill="none" stroke="black" points="1735.5,-756 1561.5,-756 1561.5,-720 1735.5,-720 1735.5,-756"/>
+<text text-anchor="middle" x="1648.5" y="-734.3" font-family="Times,serif" font-size="14.00">nn.softmax(·| axis=&#45;1)</text>
+</g>
+<!-- 40&#45;&gt;41 -->
+<g id="edge26" class="edge">
+<title>40&#45;&gt;41</title>
+<path fill="none" stroke="black" d="M1642.48,-791.7C1643.36,-783.98 1644.42,-774.71 1645.4,-766.11"/>
+<polygon fill="black" stroke="black" points="1648.89,-766.44 1646.55,-756.1 1641.93,-765.64 1648.89,-766.44"/>
+</g>
+<!-- 42 -->
+<g id="node27" class="node">
+<title>42</title>
+<polygon fill="none" stroke="black" points="1763,-684 1580,-684 1580,-648 1763,-648 1763,-684"/>
+<text text-anchor="middle" x="1671.5" y="-662.3" font-family="Times,serif" font-size="14.00">nn.dropout(·| rate=0.1)</text>
+</g>
+<!-- 41&#45;&gt;42 -->
+<g id="edge27" class="edge">
+<title>41&#45;&gt;42</title>
+<path fill="none" stroke="black" d="M1654.19,-719.7C1656.75,-711.9 1659.83,-702.51 1662.68,-693.83"/>
+<polygon fill="black" stroke="black" points="1666.08,-694.7 1665.88,-684.1 1659.43,-692.51 1666.08,-694.7"/>
+</g>
+<!-- 43 -->
+<g id="node28" class="node">
+<title>43</title>
+<polygon fill="none" stroke="black" points="1767.5,-612 1599.5,-612 1599.5,-576 1767.5,-576 1767.5,-612"/>
+<text text-anchor="middle" x="1683.5" y="-590.3" font-family="Times,serif" font-size="14.00">TupleGetItem(idx=0)</text>
+</g>
+<!-- 42&#45;&gt;43 -->
+<g id="edge28" class="edge">
+<title>42&#45;&gt;43</title>
+<path fill="none" stroke="black" d="M1674.47,-647.7C1675.79,-639.98 1677.38,-630.71 1678.85,-622.11"/>
+<polygon fill="black" stroke="black" points="1682.33,-622.55 1680.57,-612.1 1675.43,-621.37 1682.33,-622.55"/>
+</g>
+<!-- 44 -->
+<g id="node29" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="1861,-540 1518,-540 1518,-504 1861,-504 1861,-540"/>
+<text text-anchor="middle" x="1689.5" y="-518.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 14], reverse=0)</text>
+</g>
+<!-- 43&#45;&gt;44 -->
+<g id="edge29" class="edge">
+<title>43&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M1684.98,-575.7C1685.64,-567.98 1686.44,-558.71 1687.18,-550.11"/>
+<polygon fill="black" stroke="black" points="1690.67,-550.37 1688.03,-540.1 1683.69,-549.77 1690.67,-550.37"/>
+</g>
+<!-- 56 -->
+<g id="node38" class="node">
+<title>56</title>
+<polygon fill="none" stroke="black" points="1873,-468 1704,-468 1704,-432 1873,-432 1873,-468"/>
+<text text-anchor="middle" x="1788.5" y="-446.3" font-family="Times,serif" font-size="14.00">nn.batch_matmul(·, ·)</text>
+</g>
+<!-- 44&#45;&gt;56 -->
+<g id="edge38" class="edge">
+<title>44&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1713.72,-503.88C1726.42,-494.89 1742.16,-483.76 1755.92,-474.03"/>
+<polygon fill="black" stroke="black" points="1758.04,-476.82 1764.19,-468.19 1754,-471.11 1758.04,-476.82"/>
+</g>
+<!-- 49 -->
+<g id="node31" class="node">
+<title>49</title>
+<polygon fill="none" stroke="black" points="2195,-1548 1848,-1548 1848,-1512 2195,-1512 2195,-1548"/>
+<text text-anchor="middle" x="2021.5" y="-1526.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 48&#45;&gt;49 -->
+<g id="edge31" class="edge">
+<title>48&#45;&gt;49</title>
+<path fill="none" stroke="black" d="M2147.65,-1583.88C2124.72,-1574.26 2095.92,-1562.19 2071.63,-1552.01"/>
+<polygon fill="black" stroke="black" points="2072.8,-1548.71 2062.22,-1548.07 2070.09,-1555.16 2072.8,-1548.71"/>
+</g>
+<!-- 51 -->
+<g id="node33" class="node">
+<title>51</title>
+<polygon fill="none" stroke="black" points="2057,-1476 1986,-1476 1986,-1440 2057,-1440 2057,-1476"/>
+<text text-anchor="middle" x="2021.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 49&#45;&gt;51 -->
+<g id="edge32" class="edge">
+<title>49&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M2021.5,-1511.7C2021.5,-1503.98 2021.5,-1494.71 2021.5,-1486.11"/>
+<polygon fill="black" stroke="black" points="2025,-1486.1 2021.5,-1476.1 2018,-1486.1 2025,-1486.1"/>
+</g>
+<!-- 50 -->
+<g id="node32" class="node">
+<title>50</title>
+<polygon fill="none" stroke="black" points="2405.5,-1548 2213.5,-1548 2213.5,-1512 2405.5,-1512 2405.5,-1548"/>
+<text text-anchor="middle" x="2309.5" y="-1526.3" font-family="Times,serif" font-size="14.00">Constant((768,), float32)</text>
+</g>
+<!-- 50&#45;&gt;51 -->
+<g id="edge33" class="edge">
+<title>50&#45;&gt;51</title>
+<path fill="none" stroke="black" d="M2239.42,-1511.97C2185.74,-1498.92 2113.28,-1481.31 2066.96,-1470.05"/>
+<polygon fill="black" stroke="black" points="2067.74,-1466.64 2057.2,-1467.68 2066.09,-1473.44 2067.74,-1466.64"/>
+</g>
+<!-- 52 -->
+<g id="node34" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="2162,-1404 1797,-1404 1797,-1368 2162,-1368 2162,-1404"/>
+<text text-anchor="middle" x="1979.5" y="-1382.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 12, 64], reverse=0)</text>
+</g>
+<!-- 51&#45;&gt;52 -->
+<g id="edge34" class="edge">
+<title>51&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M2011.12,-1439.7C2006.24,-1431.56 2000.31,-1421.69 1994.92,-1412.7"/>
+<polygon fill="black" stroke="black" points="1997.91,-1410.88 1989.76,-1404.1 1991.91,-1414.48 1997.91,-1410.88"/>
+</g>
+<!-- 53 -->
+<g id="node35" class="node">
+<title>53</title>
+<polygon fill="none" stroke="black" points="2094,-1332 1865,-1332 1865,-1296 2094,-1296 2094,-1332"/>
+<text text-anchor="middle" x="1979.5" y="-1310.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 52&#45;&gt;53 -->
+<g id="edge35" class="edge">
+<title>52&#45;&gt;53</title>
+<path fill="none" stroke="black" d="M1979.5,-1367.7C1979.5,-1359.98 1979.5,-1350.71 1979.5,-1342.11"/>
+<polygon fill="black" stroke="black" points="1983,-1342.1 1979.5,-1332.1 1976,-1342.1 1983,-1342.1"/>
+</g>
+<!-- 54 -->
+<g id="node36" class="node">
+<title>54</title>
+<polygon fill="none" stroke="black" points="2150,-1260 1807,-1260 1807,-1224 2150,-1224 2150,-1260"/>
+<text text-anchor="middle" x="1978.5" y="-1238.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[&#45;1, 14, 64], reverse=0)</text>
+</g>
+<!-- 53&#45;&gt;54 -->
+<g id="edge36" class="edge">
+<title>53&#45;&gt;54</title>
+<path fill="none" stroke="black" d="M1979.25,-1295.7C1979.14,-1287.98 1979.01,-1278.71 1978.89,-1270.11"/>
+<polygon fill="black" stroke="black" points="1982.39,-1270.05 1978.74,-1260.1 1975.39,-1270.15 1982.39,-1270.05"/>
+</g>
+<!-- 55 -->
+<g id="node37" class="node">
+<title>55</title>
+<polygon fill="none" stroke="black" points="2080,-1044 1869,-1044 1869,-1008 2080,-1008 2080,-1044"/>
+<text text-anchor="middle" x="1974.5" y="-1022.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1])</text>
+</g>
+<!-- 54&#45;&gt;55 -->
+<g id="edge37" class="edge">
+<title>54&#45;&gt;55</title>
+<path fill="none" stroke="black" d="M1978.18,-1223.85C1977.49,-1186.83 1975.85,-1099.18 1975.01,-1054.39"/>
+<polygon fill="black" stroke="black" points="1978.51,-1054.17 1974.82,-1044.23 1971.51,-1054.3 1978.51,-1054.17"/>
+</g>
+<!-- 55&#45;&gt;56 -->
+<g id="edge39" class="edge">
+<title>55&#45;&gt;56</title>
+<path fill="none" stroke="black" d="M1956.08,-1007.64C1931.63,-982.62 1891.5,-933.83 1891.5,-883 1891.5,-883 1891.5,-883 1891.5,-593 1891.5,-552.36 1893.93,-537.21 1870.5,-504 1861.77,-491.62 1849.14,-481.42 1836.4,-473.4"/>
+<polygon fill="black" stroke="black" points="1837.99,-470.27 1827.6,-468.19 1834.42,-476.3 1837.99,-470.27"/>
+</g>
+<!-- 57 -->
+<g id="node39" class="node">
+<title>57</title>
+<polygon fill="none" stroke="black" points="1971,-396 1606,-396 1606,-360 1971,-360 1971,-396"/>
+<text text-anchor="middle" x="1788.5" y="-374.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 12, 14, 64], reverse=0)</text>
+</g>
+<!-- 56&#45;&gt;57 -->
+<g id="edge40" class="edge">
+<title>56&#45;&gt;57</title>
+<path fill="none" stroke="black" d="M1788.5,-431.7C1788.5,-423.98 1788.5,-414.71 1788.5,-406.11"/>
+<polygon fill="black" stroke="black" points="1792,-406.1 1788.5,-396.1 1785,-406.1 1792,-406.1"/>
+</g>
+<!-- 58 -->
+<g id="node40" class="node">
+<title>58</title>
+<polygon fill="none" stroke="black" points="1903,-324 1674,-324 1674,-288 1903,-288 1903,-324"/>
+<text text-anchor="middle" x="1788.5" y="-302.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[0, 2, 1, 3])</text>
+</g>
+<!-- 57&#45;&gt;58 -->
+<g id="edge41" class="edge">
+<title>57&#45;&gt;58</title>
+<path fill="none" stroke="black" d="M1788.5,-359.7C1788.5,-351.98 1788.5,-342.71 1788.5,-334.11"/>
+<polygon fill="black" stroke="black" points="1792,-334.1 1788.5,-324.1 1785,-334.1 1792,-334.1"/>
+</g>
+<!-- 59 -->
+<g id="node41" class="node">
+<title>59</title>
+<polygon fill="none" stroke="black" points="1821,-252 1756,-252 1756,-216 1821,-216 1821,-252"/>
+<text text-anchor="middle" x="1788.5" y="-230.3" font-family="Times,serif" font-size="14.00">copy(·)</text>
+</g>
+<!-- 58&#45;&gt;59 -->
+<g id="edge42" class="edge">
+<title>58&#45;&gt;59</title>
+<path fill="none" stroke="black" d="M1788.5,-287.7C1788.5,-279.98 1788.5,-270.71 1788.5,-262.11"/>
+<polygon fill="black" stroke="black" points="1792,-262.1 1788.5,-252.1 1785,-262.1 1792,-262.1"/>
+</g>
+<!-- 60 -->
+<g id="node42" class="node">
+<title>60</title>
+<polygon fill="none" stroke="black" points="1962,-180 1615,-180 1615,-144 1962,-144 1962,-180"/>
+<text text-anchor="middle" x="1788.5" y="-158.3" font-family="Times,serif" font-size="14.00">reshape(·| newshape=[1, 14, 768], reverse=0)</text>
+</g>
+<!-- 59&#45;&gt;60 -->
+<g id="edge43" class="edge">
+<title>59&#45;&gt;60</title>
+<path fill="none" stroke="black" d="M1788.5,-215.7C1788.5,-207.98 1788.5,-198.71 1788.5,-190.11"/>
+<polygon fill="black" stroke="black" points="1792,-190.1 1788.5,-180.1 1785,-190.1 1792,-190.1"/>
+</g>
+<!-- 61 -->
+<g id="node43" class="node">
+<title>61</title>
+<polygon fill="none" stroke="black" points="1831.5,-108 1745.5,-108 1745.5,-72 1831.5,-72 1831.5,-108"/>
+<text text-anchor="middle" x="1788.5" y="-86.3" font-family="Times,serif" font-size="14.00">Tuple[...])</text>
+</g>
+<!-- 60&#45;&gt;61 -->
+<g id="edge44" class="edge">
+<title>60&#45;&gt;61</title>
+<path fill="none" stroke="black" d="M1788.5,-143.7C1788.5,-135.98 1788.5,-126.71 1788.5,-118.11"/>
+<polygon fill="black" stroke="black" points="1792,-118.1 1788.5,-108.1 1785,-118.1 1792,-118.1"/>
+</g>
+<!-- 62 -->
+<g id="node44" class="node">
+<title>62</title>
+<polygon fill="none" stroke="black" points="1828.5,-36 1748.5,-36 1748.5,0 1828.5,0 1828.5,-36"/>
+<text text-anchor="middle" x="1788.5" y="-14.3" font-family="Times,serif" font-size="14.00">Function</text>
+</g>
+<!-- 61&#45;&gt;62 -->
+<g id="edge45" class="edge">
+<title>61&#45;&gt;62</title>
+<path fill="none" stroke="black" d="M1788.5,-71.7C1788.5,-63.98 1788.5,-54.71 1788.5,-46.11"/>
+<polygon fill="black" stroke="black" points="1792,-46.1 1788.5,-36.1 1785,-46.1 1792,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert_layer.svg b/images/bert-pytorch/bert_layer.svg
new file mode 100644
index 0000000..3fca855
--- /dev/null
+++ b/images/bert-pytorch/bert_layer.svg
@@ -0,0 +1,234 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="1433pt" height="793pt"
+ viewBox="0.00 0.00 1432.74 793.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 789)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-789 1428.74,-789 1428.74,4 -4,4"/>
+<text text-anchor="middle" x="712.37" y="-769.8" font-family="Times,serif" font-size="14.00">BertLayer</text>
+<g id="clust1" class="cluster">
+<title>cluster_.9</title>
+<polygon fill="none" stroke="black" points="63,-208 63,-718 1215,-718 1215,-208 63,-208"/>
+<text text-anchor="middle" x="639" y="-702.8" font-family="Times,serif" font-size="14.00">attention (BertAttention)</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_attention..7</title>
+<polygon fill="none" stroke="black" points="83,-352 83,-643 907,-643 907,-352 83,-352"/>
+<text text-anchor="middle" x="495" y="-627.8" font-family="Times,serif" font-size="14.00">attention.self (BertSelfAttention)</text>
+</g>
+<!-- inp_1 -->
+<g id="node1" class="node">
+<title>inp_1</title>
+<ellipse fill="none" stroke="black" cx="172" cy="-744" rx="36" ry="18"/>
+<text text-anchor="middle" x="172" y="-740.3" font-family="Times,serif" font-size="14.00">inp_1</text>
+</g>
+<!-- attention.inp_1 -->
+<g id="node3" class="node">
+<title>attention.inp_1</title>
+<ellipse fill="none" stroke="black" cx="172" cy="-669" rx="81.49" ry="18"/>
+<text text-anchor="middle" x="172" y="-665.3" font-family="Times,serif" font-size="14.00">attention.inp_1</text>
+</g>
+<!-- inp_1&#45;&gt;attention.inp_1 -->
+<g id="edge1" class="edge">
+<title>inp_1&#45;&gt;attention.inp_1</title>
+<path fill="none" stroke="black" d="M172,-725.7C172,-717.25 172,-706.87 172,-697.37"/>
+<polygon fill="black" stroke="black" points="175.5,-697.18 172,-687.18 168.5,-697.18 175.5,-697.18"/>
+</g>
+<!-- inp_attention_mask -->
+<g id="node2" class="node">
+<title>inp_attention_mask</title>
+<ellipse fill="none" stroke="black" cx="1324" cy="-669" rx="100.98" ry="18"/>
+<text text-anchor="middle" x="1324" y="-665.3" font-family="Times,serif" font-size="14.00">inp_attention_mask</text>
+</g>
+<!-- attention.inp_attention_mask -->
+<g id="node4" class="node">
+<title>attention.inp_attention_mask</title>
+<ellipse fill="none" stroke="black" cx="1061" cy="-594" rx="146.47" ry="18"/>
+<text text-anchor="middle" x="1061" y="-590.3" font-family="Times,serif" font-size="14.00">attention.inp_attention_mask</text>
+</g>
+<!-- inp_attention_mask&#45;&gt;attention.inp_attention_mask -->
+<g id="edge2" class="edge">
+<title>inp_attention_mask&#45;&gt;attention.inp_attention_mask</title>
+<path fill="none" stroke="black" d="M1272.04,-653.58C1230.48,-642.04 1171.94,-625.79 1127.05,-613.33"/>
+<polygon fill="black" stroke="black" points="1127.98,-609.96 1117.4,-610.66 1126.1,-616.7 1127.98,-609.96"/>
+</g>
+<!-- attention.self.inp_1 -->
+<g id="node5" class="node">
+<title>attention.self.inp_1</title>
+<ellipse fill="none" stroke="black" cx="206" cy="-594" rx="100.98" ry="18"/>
+<text text-anchor="middle" x="206" y="-590.3" font-family="Times,serif" font-size="14.00">attention.self.inp_1</text>
+</g>
+<!-- attention.inp_1&#45;&gt;attention.self.inp_1 -->
+<g id="edge3" class="edge">
+<title>attention.inp_1&#45;&gt;attention.self.inp_1</title>
+<path fill="none" stroke="black" d="M179.89,-651.07C184.02,-642.2 189.17,-631.13 193.81,-621.18"/>
+<polygon fill="black" stroke="black" points="196.99,-622.63 198.04,-612.09 190.64,-619.68 196.99,-622.63"/>
+</g>
+<!-- attention..8 -->
+<g id="node12" class="node">
+<title>attention..8</title>
+<polygon fill="none" stroke="black" points="329,-324 71,-324 71,-288 329,-288 329,-324"/>
+<text text-anchor="middle" x="200" y="-302.3" font-family="Times,serif" font-size="14.00">attention.output (BertSelfOutput)</text>
+</g>
+<!-- attention.inp_1&#45;&gt;attention..8 -->
+<g id="edge13" class="edge">
+<title>attention.inp_1&#45;&gt;attention..8</title>
+<path fill="none" stroke="black" d="M102.14,-659.76C93.24,-655.92 85.12,-650.52 79,-643 45.27,-601.57 72,-576.42 72,-523 72,-523 72,-523 72,-449 72,-405.78 53.07,-386.58 79,-352 86.3,-342.26 95.95,-334.63 106.62,-328.65"/>
+<polygon fill="black" stroke="black" points="108.45,-331.65 115.79,-324.01 105.29,-325.4 108.45,-331.65"/>
+</g>
+<!-- attention.self.inp_attention_mask -->
+<g id="node6" class="node">
+<title>attention.self.inp_attention_mask</title>
+<ellipse fill="none" stroke="black" cx="733" cy="-522" rx="165.97" ry="18"/>
+<text text-anchor="middle" x="733" y="-518.3" font-family="Times,serif" font-size="14.00">attention.self.inp_attention_mask</text>
+</g>
+<!-- attention.inp_attention_mask&#45;&gt;attention.self.inp_attention_mask -->
+<g id="edge4" class="edge">
+<title>attention.inp_attention_mask&#45;&gt;attention.self.inp_attention_mask</title>
+<path fill="none" stroke="black" d="M991.44,-578.15C939.66,-567.1 868.82,-551.99 814.4,-540.37"/>
+<polygon fill="black" stroke="black" points="814.95,-536.91 804.44,-538.25 813.49,-543.76 814.95,-536.91"/>
+</g>
+<!-- attention.self..111 -->
+<g id="node7" class="node">
+<title>attention.self..111</title>
+<polygon fill="none" stroke="black" points="329,-540 111,-540 111,-504 329,-504 329,-540"/>
+<text text-anchor="middle" x="220" y="-518.3" font-family="Times,serif" font-size="14.00">attention.self.query (Linear)</text>
+</g>
+<!-- attention.self.inp_1&#45;&gt;attention.self..111 -->
+<g id="edge5" class="edge">
+<title>attention.self.inp_1&#45;&gt;attention.self..111</title>
+<path fill="none" stroke="black" d="M209.46,-575.7C211,-567.98 212.86,-558.71 214.58,-550.11"/>
+<polygon fill="black" stroke="black" points="218.05,-550.6 216.58,-540.1 211.19,-549.22 218.05,-550.6"/>
+</g>
+<!-- attention.self..112 -->
+<g id="node8" class="node">
+<title>attention.self..112</title>
+<polygon fill="none" stroke="black" points="549,-540 347,-540 347,-504 549,-504 549,-540"/>
+<text text-anchor="middle" x="448" y="-518.3" font-family="Times,serif" font-size="14.00">attention.self.key (Linear)</text>
+</g>
+<!-- attention.self.inp_1&#45;&gt;attention.self..112 -->
+<g id="edge6" class="edge">
+<title>attention.self.inp_1&#45;&gt;attention.self..112</title>
+<path fill="none" stroke="black" d="M256.44,-578.41C292.06,-568.11 340.24,-554.17 379.36,-542.85"/>
+<polygon fill="black" stroke="black" points="380.47,-546.18 389.1,-540.04 378.52,-539.45 380.47,-546.18"/>
+</g>
+<!-- attention.self..113 -->
+<g id="node9" class="node">
+<title>attention.self..113</title>
+<polygon fill="none" stroke="black" points="306.5,-468 91.5,-468 91.5,-432 306.5,-432 306.5,-468"/>
+<text text-anchor="middle" x="199" y="-446.3" font-family="Times,serif" font-size="14.00">attention.self.value (Linear)</text>
+</g>
+<!-- attention.self.inp_1&#45;&gt;attention.self..113 -->
+<g id="edge7" class="edge">
+<title>attention.self.inp_1&#45;&gt;attention.self..113</title>
+<path fill="none" stroke="black" d="M152.56,-578.71C133.17,-570.67 113.28,-558.4 102,-540 93.64,-526.36 93.84,-517.76 102,-504 109.91,-490.66 122.49,-480.47 136,-472.75"/>
+<polygon fill="black" stroke="black" points="137.76,-475.79 144.98,-468.04 134.5,-469.59 137.76,-475.79"/>
+</g>
+<!-- attention.self..114 -->
+<g id="node10" class="node">
+<title>attention.self..114</title>
+<polygon fill="none" stroke="black" points="571,-468 325,-468 325,-432 571,-432 571,-468"/>
+<text text-anchor="middle" x="448" y="-446.3" font-family="Times,serif" font-size="14.00">attention.self.dropout (Dropout)</text>
+</g>
+<!-- attention.self.inp_attention_mask&#45;&gt;attention.self..114 -->
+<g id="edge9" class="edge">
+<title>attention.self.inp_attention_mask&#45;&gt;attention.self..114</title>
+<path fill="none" stroke="black" d="M669.4,-505.38C627.45,-495.08 572.19,-481.5 527.38,-470.5"/>
+<polygon fill="black" stroke="black" points="527.94,-467.03 517.4,-468.04 526.27,-473.83 527.94,-467.03"/>
+</g>
+<!-- attention.self..111&#45;&gt;attention.self..114 -->
+<g id="edge10" class="edge">
+<title>attention.self..111&#45;&gt;attention.self..114</title>
+<path fill="none" stroke="black" d="M275.48,-503.97C307.76,-494.06 348.61,-481.51 382.46,-471.12"/>
+<polygon fill="black" stroke="black" points="383.74,-474.39 392.27,-468.11 381.68,-467.7 383.74,-474.39"/>
+</g>
+<!-- attention.self..112&#45;&gt;attention.self..114 -->
+<g id="edge8" class="edge">
+<title>attention.self..112&#45;&gt;attention.self..114</title>
+<path fill="none" stroke="black" d="M448,-503.7C448,-495.98 448,-486.71 448,-478.11"/>
+<polygon fill="black" stroke="black" points="451.5,-478.1 448,-468.1 444.5,-478.1 451.5,-478.1"/>
+</g>
+<!-- attention.self.out_0 -->
+<g id="node11" class="node">
+<title>attention.self.out_0</title>
+<ellipse fill="none" stroke="black" cx="200" cy="-378" rx="100.98" ry="18"/>
+<text text-anchor="middle" x="200" y="-374.3" font-family="Times,serif" font-size="14.00">attention.self.out_0</text>
+</g>
+<!-- attention.self..113&#45;&gt;attention.self.out_0 -->
+<g id="edge11" class="edge">
+<title>attention.self..113&#45;&gt;attention.self.out_0</title>
+<path fill="none" stroke="black" d="M199.25,-431.7C199.36,-423.98 199.49,-414.71 199.61,-406.11"/>
+<polygon fill="black" stroke="black" points="203.11,-406.15 199.76,-396.1 196.11,-406.05 203.11,-406.15"/>
+</g>
+<!-- attention.self..114&#45;&gt;attention.self.out_0 -->
+<g id="edge12" class="edge">
+<title>attention.self..114&#45;&gt;attention.self.out_0</title>
+<path fill="none" stroke="black" d="M387.65,-431.97C349.35,-421.16 299.93,-407.21 261.43,-396.34"/>
+<polygon fill="black" stroke="black" points="261.96,-392.85 251.38,-393.5 260.06,-399.59 261.96,-392.85"/>
+</g>
+<!-- attention.self.out_0&#45;&gt;attention..8 -->
+<g id="edge14" class="edge">
+<title>attention.self.out_0&#45;&gt;attention..8</title>
+<path fill="none" stroke="black" d="M200,-359.7C200,-351.98 200,-342.71 200,-334.11"/>
+<polygon fill="black" stroke="black" points="203.5,-334.1 200,-324.1 196.5,-334.1 203.5,-334.1"/>
+</g>
+<!-- attention.out_0 -->
+<g id="node13" class="node">
+<title>attention.out_0</title>
+<ellipse fill="none" stroke="black" cx="200" cy="-234" rx="81.49" ry="18"/>
+<text text-anchor="middle" x="200" y="-230.3" font-family="Times,serif" font-size="14.00">attention.out_0</text>
+</g>
+<!-- attention..8&#45;&gt;attention.out_0 -->
+<g id="edge15" class="edge">
+<title>attention..8&#45;&gt;attention.out_0</title>
+<path fill="none" stroke="black" d="M200,-287.7C200,-279.98 200,-270.71 200,-262.11"/>
+<polygon fill="black" stroke="black" points="203.5,-262.1 200,-252.1 196.5,-262.1 203.5,-262.1"/>
+</g>
+<!-- .10 -->
+<g id="node14" class="node">
+<title>.10</title>
+<polygon fill="none" stroke="black" points="248,-180 0,-180 0,-144 248,-144 248,-180"/>
+<text text-anchor="middle" x="124" y="-158.3" font-family="Times,serif" font-size="14.00">intermediate (BertIntermediate)</text>
+</g>
+<!-- attention.out_0&#45;&gt;.10 -->
+<g id="edge16" class="edge">
+<title>attention.out_0&#45;&gt;.10</title>
+<path fill="none" stroke="black" d="M181.99,-216.41C172.45,-207.63 160.57,-196.68 150.04,-186.99"/>
+<polygon fill="black" stroke="black" points="152.4,-184.4 142.68,-180.2 147.66,-189.55 152.4,-184.4"/>
+</g>
+<!-- .11 -->
+<g id="node15" class="node">
+<title>.11</title>
+<polygon fill="none" stroke="black" points="280.5,-108 119.5,-108 119.5,-72 280.5,-72 280.5,-108"/>
+<text text-anchor="middle" x="200" y="-86.3" font-family="Times,serif" font-size="14.00">output (BertOutput)</text>
+</g>
+<!-- attention.out_0&#45;&gt;.11 -->
+<g id="edge18" class="edge">
+<title>attention.out_0&#45;&gt;.11</title>
+<path fill="none" stroke="black" d="M225.39,-216.83C237.38,-207.65 250.47,-194.99 257,-180 263.39,-165.33 263.39,-158.67 257,-144 252.07,-132.68 243.39,-122.68 234.27,-114.51"/>
+<polygon fill="black" stroke="black" points="236.5,-111.81 226.58,-108.09 232.01,-117.19 236.5,-111.81"/>
+</g>
+<!-- .10&#45;&gt;.11 -->
+<g id="edge17" class="edge">
+<title>.10&#45;&gt;.11</title>
+<path fill="none" stroke="black" d="M142.79,-143.7C152.17,-135.05 163.68,-124.45 173.91,-115.03"/>
+<polygon fill="black" stroke="black" points="176.45,-117.45 181.43,-108.1 171.7,-112.3 176.45,-117.45"/>
+</g>
+<!-- out_0 -->
+<g id="node16" class="node">
+<title>out_0</title>
+<ellipse fill="none" stroke="black" cx="200" cy="-18" rx="36.29" ry="18"/>
+<text text-anchor="middle" x="200" y="-14.3" font-family="Times,serif" font-size="14.00">out_0</text>
+</g>
+<!-- .11&#45;&gt;out_0 -->
+<g id="edge19" class="edge">
+<title>.11&#45;&gt;out_0</title>
+<path fill="none" stroke="black" d="M200,-71.7C200,-63.98 200,-54.71 200,-46.11"/>
+<polygon fill="black" stroke="black" points="203.5,-46.1 200,-36.1 196.5,-46.1 203.5,-46.1"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/bert_model.svg b/images/bert-pytorch/bert_model.svg
new file mode 100644
index 0000000..0a60e2c
--- /dev/null
+++ b/images/bert-pytorch/bert_model.svg
@@ -0,0 +1,325 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="526pt" height="1294pt"
+ viewBox="0.00 0.00 525.50 1294.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1290)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-1290 521.5,-1290 521.5,4 -4,4"/>
+<text text-anchor="middle" x="258.75" y="-1270.8" font-family="Times,serif" font-size="14.00">BertModel</text>
+<g id="clust1" class="cluster">
+<title>cluster_.3358</title>
+<polygon fill="none" stroke="black" points="34.5,-136 34.5,-1147 509.5,-1147 509.5,-136 34.5,-136"/>
+<text text-anchor="middle" x="272" y="-1131.8" font-family="Times,serif" font-size="14.00">encoder (BertEncoder)</text>
+</g>
+<!-- inp_input_ids -->
+<g id="node1" class="node">
+<title>inp_input_ids</title>
+<ellipse fill="none" stroke="black" cx="119.5" cy="-1245" rx="72.29" ry="18"/>
+<text text-anchor="middle" x="119.5" y="-1241.3" font-family="Times,serif" font-size="14.00">inp_input_ids</text>
+</g>
+<!-- .3357 -->
+<g id="node3" class="node">
+<title>.3357</title>
+<polygon fill="none" stroke="black" points="239,-1191 0,-1191 0,-1155 239,-1155 239,-1191"/>
+<text text-anchor="middle" x="119.5" y="-1169.3" font-family="Times,serif" font-size="14.00">embeddings (BertEmbeddings)</text>
+</g>
+<!-- inp_input_ids&#45;&gt;.3357 -->
+<g id="edge1" class="edge">
+<title>inp_input_ids&#45;&gt;.3357</title>
+<path fill="none" stroke="black" d="M119.5,-1226.7C119.5,-1218.98 119.5,-1209.71 119.5,-1201.11"/>
+<polygon fill="black" stroke="black" points="123,-1201.1 119.5,-1191.1 116,-1201.1 123,-1201.1"/>
+</g>
+<!-- inp_attention_mask.1 -->
+<g id="node2" class="node">
+<title>inp_attention_mask.1</title>
+<ellipse fill="none" stroke="black" cx="366.5" cy="-1173" rx="109.68" ry="18"/>
+<text text-anchor="middle" x="366.5" y="-1169.3" font-family="Times,serif" font-size="14.00">inp_attention_mask.1</text>
+</g>
+<!-- encoder.inp_attention_mask -->
+<g id="node5" class="node">
+<title>encoder.inp_attention_mask</title>
+<ellipse fill="none" stroke="black" cx="361.5" cy="-1098" rx="139.98" ry="18"/>
+<text text-anchor="middle" x="361.5" y="-1094.3" font-family="Times,serif" font-size="14.00">encoder.inp_attention_mask</text>
+</g>
+<!-- inp_attention_mask.1&#45;&gt;encoder.inp_attention_mask -->
+<g id="edge3" class="edge">
+<title>inp_attention_mask.1&#45;&gt;encoder.inp_attention_mask</title>
+<path fill="none" stroke="black" d="M365.32,-1154.7C364.74,-1146.25 364.03,-1135.87 363.37,-1126.37"/>
+<polygon fill="black" stroke="black" points="366.85,-1125.91 362.68,-1116.18 359.87,-1126.39 366.85,-1125.91"/>
+</g>
+<!-- encoder.inp_26 -->
+<g id="node4" class="node">
+<title>encoder.inp_26</title>
+<ellipse fill="none" stroke="black" cx="123.5" cy="-1098" rx="80.69" ry="18"/>
+<text text-anchor="middle" x="123.5" y="-1094.3" font-family="Times,serif" font-size="14.00">encoder.inp_26</text>
+</g>
+<!-- .3357&#45;&gt;encoder.inp_26 -->
+<g id="edge2" class="edge">
+<title>.3357&#45;&gt;encoder.inp_26</title>
+<path fill="none" stroke="black" d="M120.45,-1154.7C120.91,-1146.25 121.48,-1135.87 122,-1126.37"/>
+<polygon fill="black" stroke="black" points="125.51,-1126.35 122.56,-1116.18 118.52,-1125.97 125.51,-1126.35"/>
+</g>
+<!-- encoder..39 -->
+<g id="node6" class="node">
+<title>encoder..39</title>
+<polygon fill="none" stroke="black" points="294.5,-1044 82.5,-1044 82.5,-1008 294.5,-1008 294.5,-1044"/>
+<text text-anchor="middle" x="188.5" y="-1022.3" font-family="Times,serif" font-size="14.00">encoder.layer.0 (BertLayer)</text>
+</g>
+<!-- encoder.inp_26&#45;&gt;encoder..39 -->
+<g id="edge4" class="edge">
+<title>encoder.inp_26&#45;&gt;encoder..39</title>
+<path fill="none" stroke="black" d="M139.23,-1080.05C147.1,-1071.58 156.77,-1061.17 165.46,-1051.82"/>
+<polygon fill="black" stroke="black" points="168.22,-1053.98 172.46,-1044.28 163.09,-1049.22 168.22,-1053.98"/>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..39 -->
+<g id="edge5" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..39</title>
+<path fill="none" stroke="black" d="M320.94,-1080.59C296.75,-1070.8 265.8,-1058.28 239.9,-1047.8"/>
+<polygon fill="black" stroke="black" points="241.2,-1044.55 230.61,-1044.04 238.57,-1051.04 241.2,-1044.55"/>
+</g>
+<!-- encoder..40 -->
+<g id="node7" class="node">
+<title>encoder..40</title>
+<polygon fill="none" stroke="black" points="334.5,-972 122.5,-972 122.5,-936 334.5,-936 334.5,-972"/>
+<text text-anchor="middle" x="228.5" y="-950.3" font-family="Times,serif" font-size="14.00">encoder.layer.1 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..40 -->
+<g id="edge7" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..40</title>
+<path fill="none" stroke="black" d="M356.56,-1079.85C350.3,-1060.53 337.91,-1029.08 318.5,-1008 307,-995.52 291.89,-985.08 277.33,-976.86"/>
+<polygon fill="black" stroke="black" points="278.84,-973.7 268.38,-972.03 275.52,-979.86 278.84,-973.7"/>
+</g>
+<!-- encoder..41 -->
+<g id="node8" class="node">
+<title>encoder..41</title>
+<polygon fill="none" stroke="black" points="354.5,-900 142.5,-900 142.5,-864 354.5,-864 354.5,-900"/>
+<text text-anchor="middle" x="248.5" y="-878.3" font-family="Times,serif" font-size="14.00">encoder.layer.2 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..41 -->
+<g id="edge9" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..41</title>
+<path fill="none" stroke="black" d="M364.36,-1079.79C368.62,-1048.34 373.24,-980.74 343.5,-936 334.75,-922.84 321.57,-912.62 307.8,-904.83"/>
+<polygon fill="black" stroke="black" points="309.18,-901.6 298.7,-900.05 305.92,-907.79 309.18,-901.6"/>
+</g>
+<!-- encoder..42 -->
+<g id="node9" class="node">
+<title>encoder..42</title>
+<polygon fill="none" stroke="black" points="374.5,-828 162.5,-828 162.5,-792 374.5,-792 374.5,-828"/>
+<text text-anchor="middle" x="268.5" y="-806.3" font-family="Times,serif" font-size="14.00">encoder.layer.3 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..42 -->
+<g id="edge11" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..42</title>
+<path fill="none" stroke="black" d="M367.32,-1079.88C380.1,-1039.01 406.2,-933.91 363.5,-864 355.42,-850.78 342.77,-840.59 329.3,-832.84"/>
+<polygon fill="black" stroke="black" points="330.84,-829.7 320.37,-828.1 327.56,-835.88 330.84,-829.7"/>
+</g>
+<!-- encoder..43 -->
+<g id="node10" class="node">
+<title>encoder..43</title>
+<polygon fill="none" stroke="black" points="384.5,-756 172.5,-756 172.5,-720 384.5,-720 384.5,-756"/>
+<text text-anchor="middle" x="278.5" y="-734.3" font-family="Times,serif" font-size="14.00">encoder.layer.4 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..43 -->
+<g id="edge13" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..43</title>
+<path fill="none" stroke="black" d="M370.57,-1079.82C375.51,-1069.67 381.26,-1056.42 384.5,-1044 394.53,-1005.53 393.5,-994.76 393.5,-955 393.5,-955 393.5,-955 393.5,-881 393.5,-841.2 406.22,-824.68 383.5,-792 374.14,-778.53 360.31,-768.3 345.74,-760.6"/>
+<polygon fill="black" stroke="black" points="347.13,-757.38 336.61,-756.12 344.05,-763.66 347.13,-757.38"/>
+</g>
+<!-- encoder..44 -->
+<g id="node11" class="node">
+<title>encoder..44</title>
+<polygon fill="none" stroke="black" points="384.5,-684 172.5,-684 172.5,-648 384.5,-648 384.5,-684"/>
+<text text-anchor="middle" x="278.5" y="-662.3" font-family="Times,serif" font-size="14.00">encoder.layer.5 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..44 -->
+<g id="edge15" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..44</title>
+<path fill="none" stroke="black" d="M373.2,-1080C379.68,-1069.92 387.43,-1056.67 392.5,-1044 407.59,-1006.26 413.5,-995.64 413.5,-955 413.5,-955 413.5,-955 413.5,-809 413.5,-768.46 418.99,-751.52 393.5,-720 382.46,-706.35 367.13,-696.06 351.25,-688.37"/>
+<polygon fill="black" stroke="black" points="352.42,-685.05 341.86,-684.13 349.53,-691.43 352.42,-685.05"/>
+</g>
+<!-- encoder..45 -->
+<g id="node12" class="node">
+<title>encoder..45</title>
+<polygon fill="none" stroke="black" points="294.5,-612 82.5,-612 82.5,-576 294.5,-576 294.5,-612"/>
+<text text-anchor="middle" x="188.5" y="-590.3" font-family="Times,serif" font-size="14.00">encoder.layer.6 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..45 -->
+<g id="edge16" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..45</title>
+<path fill="none" stroke="black" d="M345.42,-1079.8C336.69,-1069.86 326.15,-1056.82 318.5,-1044 309.62,-1029.12 316.96,-1018.92 303.5,-1008 236.76,-953.85 171.94,-1035.02 113.5,-972 86.43,-942.81 103.5,-922.8 103.5,-883 103.5,-883 103.5,-883 103.5,-737 103.5,-690.45 137.54,-645.95 162.54,-619.55"/>
+<polygon fill="black" stroke="black" points="165.16,-621.87 169.64,-612.27 160.15,-616.98 165.16,-621.87"/>
+</g>
+<!-- encoder..46 -->
+<g id="node13" class="node">
+<title>encoder..46</title>
+<polygon fill="none" stroke="black" points="284.5,-540 72.5,-540 72.5,-504 284.5,-504 284.5,-540"/>
+<text text-anchor="middle" x="178.5" y="-518.3" font-family="Times,serif" font-size="14.00">encoder.layer.7 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..46 -->
+<g id="edge19" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..46</title>
+<path fill="none" stroke="black" d="M264.82,-1084.97C186.39,-1074.27 86.72,-1058.18 73.5,-1044 46.36,-1014.88 63.5,-994.8 63.5,-955 63.5,-955 63.5,-955 63.5,-665 63.5,-625.2 50.78,-608.68 73.5,-576 82.86,-562.53 96.69,-552.3 111.26,-544.6"/>
+<polygon fill="black" stroke="black" points="112.95,-547.66 120.39,-540.12 109.87,-541.38 112.95,-547.66"/>
+</g>
+<!-- encoder..47 -->
+<g id="node14" class="node">
+<title>encoder..47</title>
+<polygon fill="none" stroke="black" points="284.5,-468 72.5,-468 72.5,-432 284.5,-432 284.5,-468"/>
+<text text-anchor="middle" x="178.5" y="-446.3" font-family="Times,serif" font-size="14.00">encoder.layer.8 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..47 -->
+<g id="edge21" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..47</title>
+<path fill="none" stroke="black" d="M260.15,-1085.54C176.34,-1075.09 68.82,-1059.02 54.5,-1044 27,-1015.15 43.5,-994.86 43.5,-955 43.5,-955 43.5,-955 43.5,-593 43.5,-552.46 38.01,-535.52 63.5,-504 74.54,-490.35 89.87,-480.06 105.75,-472.37"/>
+<polygon fill="black" stroke="black" points="107.47,-475.43 115.14,-468.13 104.58,-469.05 107.47,-475.43"/>
+</g>
+<!-- encoder..48 -->
+<g id="node15" class="node">
+<title>encoder..48</title>
+<polygon fill="none" stroke="black" points="434.5,-396 222.5,-396 222.5,-360 434.5,-360 434.5,-396"/>
+<text text-anchor="middle" x="328.5" y="-374.3" font-family="Times,serif" font-size="14.00">encoder.layer.9 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..48 -->
+<g id="edge23" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..48</title>
+<path fill="none" stroke="black" d="M377.53,-1079.94C386.15,-1070.04 396.44,-1057 403.5,-1044 423.43,-1007.32 433.5,-996.74 433.5,-955 433.5,-955 433.5,-955 433.5,-521 433.5,-471.46 391.68,-427.9 360.82,-402.49"/>
+<polygon fill="black" stroke="black" points="362.85,-399.63 352.85,-396.13 358.48,-405.1 362.85,-399.63"/>
+</g>
+<!-- encoder..49 -->
+<g id="node16" class="node">
+<title>encoder..49</title>
+<polygon fill="none" stroke="black" points="454,-324 233,-324 233,-288 454,-288 454,-324"/>
+<text text-anchor="middle" x="343.5" y="-302.3" font-family="Times,serif" font-size="14.00">encoder.layer.10 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..49 -->
+<g id="edge24" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..49</title>
+<path fill="none" stroke="black" d="M383.35,-1080.2C394.72,-1070.58 408.15,-1057.71 417.5,-1044 441.54,-1008.75 453.5,-997.67 453.5,-955 453.5,-955 453.5,-955 453.5,-449 453.5,-409.2 465.9,-392.9 443.5,-360 434.39,-346.62 420.78,-336.36 406.52,-328.6"/>
+<polygon fill="black" stroke="black" points="408.09,-325.47 397.59,-324.08 404.94,-331.72 408.09,-325.47"/>
+</g>
+<!-- encoder..50 -->
+<g id="node17" class="node">
+<title>encoder..50</title>
+<polygon fill="none" stroke="black" points="478,-252 257,-252 257,-216 478,-216 478,-252"/>
+<text text-anchor="middle" x="367.5" y="-230.3" font-family="Times,serif" font-size="14.00">encoder.layer.11 (BertLayer)</text>
+</g>
+<!-- encoder.inp_attention_mask&#45;&gt;encoder..50 -->
+<g id="edge27" class="edge">
+<title>encoder.inp_attention_mask&#45;&gt;encoder..50</title>
+<path fill="none" stroke="black" d="M390.96,-1080.29C405.25,-1071.04 421.8,-1058.49 433.5,-1044 460.75,-1010.26 473.5,-998.37 473.5,-955 473.5,-955 473.5,-955 473.5,-377 473.5,-337.14 484.76,-321.06 462.5,-288 453.68,-274.89 440.47,-264.69 426.69,-256.89"/>
+<polygon fill="black" stroke="black" points="428.08,-253.67 417.6,-252.11 424.82,-259.86 428.08,-253.67"/>
+</g>
+<!-- encoder..39&#45;&gt;encoder..40 -->
+<g id="edge6" class="edge">
+<title>encoder..39&#45;&gt;encoder..40</title>
+<path fill="none" stroke="black" d="M198.39,-1007.7C202.99,-999.64 208.56,-989.89 213.65,-980.98"/>
+<polygon fill="black" stroke="black" points="216.8,-982.52 218.73,-972.1 210.73,-979.05 216.8,-982.52"/>
+</g>
+<!-- encoder..40&#45;&gt;encoder..41 -->
+<g id="edge8" class="edge">
+<title>encoder..40&#45;&gt;encoder..41</title>
+<path fill="none" stroke="black" d="M233.44,-935.7C235.67,-927.9 238.35,-918.51 240.83,-909.83"/>
+<polygon fill="black" stroke="black" points="244.23,-910.68 243.61,-900.1 237.5,-908.76 244.23,-910.68"/>
+</g>
+<!-- encoder..41&#45;&gt;encoder..42 -->
+<g id="edge10" class="edge">
+<title>encoder..41&#45;&gt;encoder..42</title>
+<path fill="none" stroke="black" d="M253.44,-863.7C255.67,-855.9 258.35,-846.51 260.83,-837.83"/>
+<polygon fill="black" stroke="black" points="264.23,-838.68 263.61,-828.1 257.5,-836.76 264.23,-838.68"/>
+</g>
+<!-- encoder..42&#45;&gt;encoder..43 -->
+<g id="edge12" class="edge">
+<title>encoder..42&#45;&gt;encoder..43</title>
+<path fill="none" stroke="black" d="M270.97,-791.7C272.07,-783.98 273.4,-774.71 274.63,-766.11"/>
+<polygon fill="black" stroke="black" points="278.11,-766.5 276.06,-756.1 271.18,-765.51 278.11,-766.5"/>
+</g>
+<!-- encoder..43&#45;&gt;encoder..44 -->
+<g id="edge14" class="edge">
+<title>encoder..43&#45;&gt;encoder..44</title>
+<path fill="none" stroke="black" d="M278.5,-719.7C278.5,-711.98 278.5,-702.71 278.5,-694.11"/>
+<polygon fill="black" stroke="black" points="282,-694.1 278.5,-684.1 275,-694.1 282,-694.1"/>
+</g>
+<!-- encoder..44&#45;&gt;encoder..45 -->
+<g id="edge17" class="edge">
+<title>encoder..44&#45;&gt;encoder..45</title>
+<path fill="none" stroke="black" d="M256.25,-647.7C244.92,-638.88 230.97,-628.03 218.68,-618.47"/>
+<polygon fill="black" stroke="black" points="220.53,-615.48 210.49,-612.1 216.24,-621.01 220.53,-615.48"/>
+</g>
+<!-- encoder..45&#45;&gt;encoder..46 -->
+<g id="edge18" class="edge">
+<title>encoder..45&#45;&gt;encoder..46</title>
+<path fill="none" stroke="black" d="M186.03,-575.7C184.93,-567.98 183.6,-558.71 182.37,-550.11"/>
+<polygon fill="black" stroke="black" points="185.82,-549.51 180.94,-540.1 178.89,-550.5 185.82,-549.51"/>
+</g>
+<!-- encoder..46&#45;&gt;encoder..47 -->
+<g id="edge20" class="edge">
+<title>encoder..46&#45;&gt;encoder..47</title>
+<path fill="none" stroke="black" d="M178.5,-503.7C178.5,-495.98 178.5,-486.71 178.5,-478.11"/>
+<polygon fill="black" stroke="black" points="182,-478.1 178.5,-468.1 175,-478.1 182,-478.1"/>
+</g>
+<!-- encoder..47&#45;&gt;encoder..48 -->
+<g id="edge22" class="edge">
+<title>encoder..47&#45;&gt;encoder..48</title>
+<path fill="none" stroke="black" d="M215.19,-431.88C235.52,-422.39 260.97,-410.51 282.6,-400.42"/>
+<polygon fill="black" stroke="black" points="284.34,-403.47 291.93,-396.07 281.38,-397.13 284.34,-403.47"/>
+</g>
+<!-- encoder..48&#45;&gt;encoder..49 -->
+<g id="edge25" class="edge">
+<title>encoder..48&#45;&gt;encoder..49</title>
+<path fill="none" stroke="black" d="M332.21,-359.7C333.86,-351.98 335.85,-342.71 337.69,-334.11"/>
+<polygon fill="black" stroke="black" points="341.16,-334.62 339.83,-324.1 334.32,-333.15 341.16,-334.62"/>
+</g>
+<!-- encoder..49&#45;&gt;encoder..50 -->
+<g id="edge26" class="edge">
+<title>encoder..49&#45;&gt;encoder..50</title>
+<path fill="none" stroke="black" d="M349.43,-287.7C352.11,-279.9 355.33,-270.51 358.3,-261.83"/>
+<polygon fill="black" stroke="black" points="361.7,-262.7 361.64,-252.1 355.08,-260.43 361.7,-262.7"/>
+</g>
+<!-- encoder.out_0 -->
+<g id="node18" class="node">
+<title>encoder.out_0</title>
+<ellipse fill="none" stroke="black" cx="367.5" cy="-162" rx="75.29" ry="18"/>
+<text text-anchor="middle" x="367.5" y="-158.3" font-family="Times,serif" font-size="14.00">encoder.out_0</text>
+</g>
+<!-- encoder..50&#45;&gt;encoder.out_0 -->
+<g id="edge28" class="edge">
+<title>encoder..50&#45;&gt;encoder.out_0</title>
+<path fill="none" stroke="black" d="M367.5,-215.7C367.5,-207.98 367.5,-198.71 367.5,-190.11"/>
+<polygon fill="black" stroke="black" points="371,-190.1 367.5,-180.1 364,-190.1 371,-190.1"/>
+</g>
+<!-- .3359 -->
+<g id="node19" class="node">
+<title>.3359</title>
+<polygon fill="none" stroke="black" points="391,-108 238,-108 238,-72 391,-72 391,-108"/>
+<text text-anchor="middle" x="314.5" y="-86.3" font-family="Times,serif" font-size="14.00">pooler (BertPooler)</text>
+</g>
+<!-- encoder.out_0&#45;&gt;.3359 -->
+<g id="edge29" class="edge">
+<title>encoder.out_0&#45;&gt;.3359</title>
+<path fill="none" stroke="black" d="M354.67,-144.05C348.38,-135.75 340.68,-125.58 333.71,-116.38"/>
+<polygon fill="black" stroke="black" points="336.41,-114.14 327.58,-108.28 330.83,-118.36 336.41,-114.14"/>
+</g>
+<!-- out_0 -->
+<g id="node20" class="node">
+<title>out_0</title>
+<ellipse fill="none" stroke="black" cx="366.5" cy="-18" rx="36.29" ry="18"/>
+<text text-anchor="middle" x="366.5" y="-14.3" font-family="Times,serif" font-size="14.00">out_0</text>
+</g>
+<!-- encoder.out_0&#45;&gt;out_0 -->
+<g id="edge31" class="edge">
+<title>encoder.out_0&#45;&gt;out_0</title>
+<path fill="none" stroke="black" d="M381.48,-144.04C388.69,-134.18 396.75,-121.13 400.5,-108 404.9,-92.62 405,-87.35 400.5,-72 397.53,-61.88 391.97,-51.86 386.2,-43.3"/>
+<polygon fill="black" stroke="black" points="388.88,-41.04 380.2,-34.97 383.2,-45.13 388.88,-41.04"/>
+</g>
+<!-- .3359&#45;&gt;out_0 -->
+<g id="edge30" class="edge">
+<title>.3359&#45;&gt;out_0</title>
+<path fill="none" stroke="black" d="M327.35,-71.7C333.71,-63.14 341.49,-52.67 348.43,-43.33"/>
+<polygon fill="black" stroke="black" points="351.35,-45.26 354.51,-35.14 345.73,-41.08 351.35,-45.26"/>
+</g>
+</g>
+</svg>
diff --git a/images/bert-pytorch/pytorch-tvm-training_20_0.svg b/images/bert-pytorch/pytorch-tvm-training_20_0.svg
new file mode 100644
index 0000000..4521fb8
--- /dev/null
+++ b/images/bert-pytorch/pytorch-tvm-training_20_0.svg
@@ -0,0 +1,1237 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="3671pt" height="3716pt"
+ viewBox="0.00 0.00 3671.03 3716.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 3712)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-3712 3667.03,-3712 3667.03,4 -4,4"/>
+<!-- 0 -->
+<g id="node1" class="node">
+<title>0</title>
+<ellipse fill="none" stroke="black" cx="2012.5" cy="-3546" rx="170.87" ry="18"/>
+<text text-anchor="middle" x="2012.5" y="-3542.3" font-family="Times,serif" font-size="14.00">input: Tensor[(1, 14, 768), float32]</text>
+</g>
+<!-- 29 -->
+<g id="node19" class="node">
+<title>29</title>
+<polygon fill="none" stroke="black" points="2038.5,-3492 1590.5,-3492 1590.5,-3456 2038.5,-3456 2038.5,-3492"/>
+<text text-anchor="middle" x="1814.5" y="-3470.3" font-family="Times,serif" font-size="14.00">reshape(·, [ &#45;1 &#160;14 768]| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;29 -->
+<g id="edge1" class="edge">
+<title>0&#45;&gt;29</title>
+<path fill="none" stroke="black" d="M1966.08,-3528.59C1938.02,-3518.67 1902.03,-3505.95 1872.16,-3495.39"/>
+<polygon fill="black" stroke="black" points="1873.29,-3492.07 1862.7,-3492.04 1870.96,-3498.67 1873.29,-3492.07"/>
+</g>
+<!-- 44 -->
+<g id="node29" class="node">
+<title>44</title>
+<polygon fill="none" stroke="black" points="2504.5,-3492 2056.5,-3492 2056.5,-3456 2504.5,-3456 2504.5,-3492"/>
+<text text-anchor="middle" x="2280.5" y="-3470.3" font-family="Times,serif" font-size="14.00">reshape(·, [ &#45;1 &#160;14 768]| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;44 -->
+<g id="edge13" class="edge">
+<title>0&#45;&gt;44</title>
+<path fill="none" stroke="black" d="M2073.31,-3529.12C2112.49,-3518.88 2163.7,-3505.51 2205.41,-3494.61"/>
+<polygon fill="black" stroke="black" points="2206.45,-3497.96 2215.25,-3492.04 2204.69,-3491.19 2206.45,-3497.96"/>
+</g>
+<!-- 72 -->
+<g id="node49" class="node">
+<title>72</title>
+<polygon fill="none" stroke="black" points="1572.5,-3492 1124.5,-3492 1124.5,-3456 1572.5,-3456 1572.5,-3492"/>
+<text text-anchor="middle" x="1348.5" y="-3470.3" font-family="Times,serif" font-size="14.00">reshape(·, [ &#45;1 &#160;14 768]| newshape=[&#45;1, 14, 768], reverse=0)</text>
+</g>
+<!-- 0&#45;&gt;72 -->
+<g id="edge37" class="edge">
+<title>0&#45;&gt;72</title>
+<path fill="none" stroke="black" d="M1896.46,-3532.77C1792.43,-3521.8 1638.42,-3505.56 1520.15,-3493.1"/>
+<polygon fill="black" stroke="black" points="1520.21,-3489.58 1509.9,-3492.01 1519.48,-3496.54 1520.21,-3489.58"/>
+</g>
+<!-- 106 -->
+<g id="node74" class="node">
+<title>106</title>
+<polygon fill="none" stroke="black" points="2856,-1476 2785,-1476 2785,-1440 2856,-1440 2856,-1476"/>
+<text text-anchor="middle" x="2820.5" y="-1454.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 0&#45;&gt;106 -->
+<g id="edge68" class="edge">
+<title>0&#45;&gt;106</title>
+<path fill="none" stroke="black" d="M2161.62,-3537.11C2367.88,-3525.77 2717.95,-3504.82 2743.5,-3492 2790.25,-3468.55 2820.5,-3455.3 2820.5,-3403 2820.5,-3403 2820.5,-3403 2820.5,-1601 2820.5,-1561 2820.5,-1514.65 2820.5,-1486.08"/>
+<polygon fill="black" stroke="black" points="2824,-1486.05 2820.5,-1476.05 2817,-1486.05 2824,-1486.05"/>
+</g>
+<!-- 1 -->
+<g id="node2" class="node">
+<title>1</title>
+<ellipse fill="none" stroke="black" cx="1743.5" cy="-2682" rx="217.96" ry="18"/>
+<text text-anchor="middle" x="1743.5" y="-2678.3" font-family="Times,serif" font-size="14.00">attention_mask: Tensor[(1, 1, 1, 14), float32]</text>
+</g>
+<!-- 65 -->
+<g id="node44" class="node">
+<title>65</title>
+<polygon fill="none" stroke="black" points="2024,-2628 1953,-2628 1953,-2592 2024,-2592 2024,-2628"/>
+<text text-anchor="middle" x="1988.5" y="-2606.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 1&#45;&gt;65 -->
+<g id="edge32" class="edge">
+<title>1&#45;&gt;65</title>
+<path fill="none" stroke="black" d="M1800.94,-2664.59C1844.23,-2652.22 1902.79,-2635.49 1942.89,-2624.03"/>
+<polygon fill="black" stroke="black" points="1944.01,-2627.35 1952.67,-2621.24 1942.09,-2620.62 1944.01,-2627.35"/>
+</g>
+<!-- 2 -->
+<g id="node3" class="node">
+<title>2</title>
+<ellipse fill="none" stroke="black" cx="1000.5" cy="-3690" rx="265.65" ry="18"/>
+<text text-anchor="middle" x="1000.5" y="-3686.3" font-family="Times,serif" font-size="14.00">attention.self.query.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 30 -->
+<g id="node20" class="node">
+<title>30</title>
+<polygon fill="none" stroke="black" points="1097,-3636 904,-3636 904,-3600 1097,-3600 1097,-3636"/>
+<text text-anchor="middle" x="1000.5" y="-3614.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 2&#45;&gt;30 -->
+<g id="edge2" class="edge">
+<title>2&#45;&gt;30</title>
+<path fill="none" stroke="black" d="M1000.5,-3671.7C1000.5,-3663.98 1000.5,-3654.71 1000.5,-3646.11"/>
+<polygon fill="black" stroke="black" points="1004,-3646.1 1000.5,-3636.1 997,-3646.1 1004,-3646.1"/>
+</g>
+<!-- 3 -->
+<g id="node4" class="node">
+<title>3</title>
+<ellipse fill="none" stroke="black" cx="1637.5" cy="-3330" rx="232.86" ry="18"/>
+<text text-anchor="middle" x="1637.5" y="-3326.3" font-family="Times,serif" font-size="14.00">attention.self.query.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 37 -->
+<g id="node25" class="node">
+<title>37</title>
+<polygon fill="none" stroke="black" points="1673,-3276 1602,-3276 1602,-3240 1673,-3240 1673,-3276"/>
+<text text-anchor="middle" x="1637.5" y="-3254.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 3&#45;&gt;37 -->
+<g id="edge9" class="edge">
+<title>3&#45;&gt;37</title>
+<path fill="none" stroke="black" d="M1637.5,-3311.7C1637.5,-3303.98 1637.5,-3294.71 1637.5,-3286.11"/>
+<polygon fill="black" stroke="black" points="1641,-3286.1 1637.5,-3276.1 1634,-3286.1 1641,-3286.1"/>
+</g>
+<!-- 4 -->
+<g id="node5" class="node">
+<title>4</title>
+<ellipse fill="none" stroke="black" cx="2628.5" cy="-3690" rx="254.55" ry="18"/>
+<text text-anchor="middle" x="2628.5" y="-3686.3" font-family="Times,serif" font-size="14.00">attention.self.key.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 45 -->
+<g id="node30" class="node">
+<title>45</title>
+<polygon fill="none" stroke="black" points="2725,-3636 2532,-3636 2532,-3600 2725,-3600 2725,-3636"/>
+<text text-anchor="middle" x="2628.5" y="-3614.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 4&#45;&gt;45 -->
+<g id="edge14" class="edge">
+<title>4&#45;&gt;45</title>
+<path fill="none" stroke="black" d="M2628.5,-3671.7C2628.5,-3663.98 2628.5,-3654.71 2628.5,-3646.11"/>
+<polygon fill="black" stroke="black" points="2632,-3646.1 2628.5,-3636.1 2625,-3646.1 2632,-3646.1"/>
+</g>
+<!-- 5 -->
+<g id="node6" class="node">
+<title>5</title>
+<ellipse fill="none" stroke="black" cx="2109.5" cy="-3330" rx="221.76" ry="18"/>
+<text text-anchor="middle" x="2109.5" y="-3326.3" font-family="Times,serif" font-size="14.00">attention.self.key.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 52 -->
+<g id="node35" class="node">
+<title>52</title>
+<polygon fill="none" stroke="black" points="2145,-3276 2074,-3276 2074,-3240 2145,-3240 2145,-3276"/>
+<text text-anchor="middle" x="2109.5" y="-3254.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 5&#45;&gt;52 -->
+<g id="edge21" class="edge">
+<title>5&#45;&gt;52</title>
+<path fill="none" stroke="black" d="M2109.5,-3311.7C2109.5,-3303.98 2109.5,-3294.71 2109.5,-3286.11"/>
+<polygon fill="black" stroke="black" points="2113,-3286.1 2109.5,-3276.1 2106,-3286.1 2113,-3286.1"/>
+</g>
+<!-- 6 -->
+<g id="node7" class="node">
+<title>6</title>
+<ellipse fill="none" stroke="black" cx="336.5" cy="-3690" rx="265.35" ry="18"/>
+<text text-anchor="middle" x="336.5" y="-3686.3" font-family="Times,serif" font-size="14.00">attention.self.value.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 73 -->
+<g id="node50" class="node">
+<title>73</title>
+<polygon fill="none" stroke="black" points="433,-3636 240,-3636 240,-3600 433,-3600 433,-3636"/>
+<text text-anchor="middle" x="336.5" y="-3614.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 6&#45;&gt;73 -->
+<g id="edge38" class="edge">
+<title>6&#45;&gt;73</title>
+<path fill="none" stroke="black" d="M336.5,-3671.7C336.5,-3663.98 336.5,-3654.71 336.5,-3646.11"/>
+<polygon fill="black" stroke="black" points="340,-3646.1 336.5,-3636.1 333,-3646.1 340,-3646.1"/>
+</g>
+<!-- 7 -->
+<g id="node8" class="node">
+<title>7</title>
+<ellipse fill="none" stroke="black" cx="693.5" cy="-3330" rx="232.06" ry="18"/>
+<text text-anchor="middle" x="693.5" y="-3326.3" font-family="Times,serif" font-size="14.00">attention.self.value.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 80 -->
+<g id="node55" class="node">
+<title>80</title>
+<polygon fill="none" stroke="black" points="729,-3276 658,-3276 658,-3240 729,-3240 729,-3276"/>
+<text text-anchor="middle" x="693.5" y="-3254.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 7&#45;&gt;80 -->
+<g id="edge45" class="edge">
+<title>7&#45;&gt;80</title>
+<path fill="none" stroke="black" d="M693.5,-3311.7C693.5,-3303.98 693.5,-3294.71 693.5,-3286.11"/>
+<polygon fill="black" stroke="black" points="697,-3286.1 693.5,-3276.1 690,-3286.1 697,-3286.1"/>
+</g>
+<!-- 8 -->
+<g id="node9" class="node">
+<title>8</title>
+<ellipse fill="none" stroke="black" cx="1578.5" cy="-2106" rx="282.15" ry="18"/>
+<text text-anchor="middle" x="1578.5" y="-2102.3" font-family="Times,serif" font-size="14.00">attention.output.dense.weight: Tensor[(768, 768), float32]</text>
+</g>
+<!-- 96 -->
+<g id="node66" class="node">
+<title>96</title>
+<polygon fill="none" stroke="black" points="1675,-2052 1482,-2052 1482,-2016 1675,-2016 1675,-2052"/>
+<text text-anchor="middle" x="1578.5" y="-2030.3" font-family="Times,serif" font-size="14.00">transpose(·| axes=[1, 0])</text>
+</g>
+<!-- 8&#45;&gt;96 -->
+<g id="edge57" class="edge">
+<title>8&#45;&gt;96</title>
+<path fill="none" stroke="black" d="M1578.5,-2087.7C1578.5,-2079.98 1578.5,-2070.71 1578.5,-2062.11"/>
+<polygon fill="black" stroke="black" points="1582,-2062.1 1578.5,-2052.1 1575,-2062.1 1582,-2062.1"/>
+</g>
+<!-- 9 -->
+<g id="node10" class="node">
+<title>9</title>
+<ellipse fill="none" stroke="black" cx="2542.5" cy="-1746" rx="248.86" ry="18"/>
+<text text-anchor="middle" x="2542.5" y="-1742.3" font-family="Times,serif" font-size="14.00">attention.output.dense.bias: Tensor[(768,), float32]</text>
+</g>
+<!-- 103 -->
+<g id="node71" class="node">
+<title>103</title>
+<polygon fill="none" stroke="black" points="2578,-1692 2507,-1692 2507,-1656 2578,-1656 2578,-1692"/>
+<text text-anchor="middle" x="2542.5" y="-1670.3" font-family="Times,serif" font-size="14.00">add(·, ·)</text>
+</g>
+<!-- 9&#45;&gt;103 -->
+<g id="edge64" class="edge">
+<title>9&#45;&gt;103</title>
+<path fill="none" stroke="black" d="M2542.5,-1727.7C2542.5,-1719.98 2542.5,-1710.71 2542.5,-1702.11"/>
+<polygon fill="black" stroke="black" points="2546,-1702.1 2542.5,-1692.1 2539,-1702.1 2546,-1702.1"/>
+</g>
+<!-- 10 -->
+<g id="node11" class="node">
+<title>10</title>
+<ellipse fill="none" stroke="black" cx="1913.5" cy="-1458" rx="286.75" ry="18"/>
+<text text-anchor="middle" x="1913.5" y="-1454.3" font-family="Times,serif" font-size="14.00">attention.output.LayerNorm.weight: Tensor[(768,), float32]</text>
+</g>
+<!-- 107 -->
+<g id="node75" class="node">
+<title>107</title>
+<polygon fill="none" stroke="black" points="2739.5,-1404 2261.5,-1404 2261.5,-1368 2739.5,-1368 2739.5,-1404"/>
+<text text-anchor="middle" x="2500.5" y="-1382.3" font-family="Times,serif" font-size="14.00">nn.layer_norm(·, ·, ·| axis=&#45;1, epsilon=1e&#45;12, center=1, scale=1)</text>
+</g>
+<!-- 10&#45;&gt;107 -->
+<g id="edge70" class="edge">
+<title>10&#45;&gt;107</title>
+<path fill="none" stroke="black" d="M2040.51,-1441.85C2130.7,-1431.1 2252.23,-1416.61 2347.72,-1405.22"/>
+<polygon fill="black" stroke="black" points="2348.24,-1408.68 2357.76,-1404.02 2347.41,-1401.73 2348.24,-1408.68"/>
+</g>
+<!-- 11 -->
+<g id="node12" class="node">
+<title>11</title>
... 14482 lines suppressed ...