You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2020/07/14 16:12:16 UTC

[incubator-tvm-site] branch asf-site updated: Build at Tue Jul 14 09:12:04 PDT 2020

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-tvm-site.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 202b30a  Build at Tue Jul 14 09:12:04 PDT 2020
202b30a is described below

commit 202b30a45f2d83f44e07227e2523030d199c298c
Author: tqchen <tq...@gmail.com>
AuthorDate: Tue Jul 14 09:12:04 2020 -0700

    Build at Tue Jul 14 09:12:04 PDT 2020
---
 ...s-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html | 16 +++----
 2020/07/14/bert-pytorch-tvm.html                   | 32 ++++++-------
 atom.xml                                           | 50 ++++++++++-----------
 rss.xml                                            | 52 +++++++++++-----------
 4 files changed, 75 insertions(+), 75 deletions(-)

diff --git a/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html b/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
index 07f0cb6..7d0db87 100644
--- a/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
+++ b/2017/10/30/Bringing-AMDGPUs-to-TVM-Stack-and-NNVM-Compiler-with-ROCm.html
@@ -262,13 +262,13 @@ We are starting to look at performance optimization and we expect more improveme
 <p>You should see something like this:</p>
 
 <figure class="highlight"><pre><code class="language-llvm" data-lang="llvm"><span class="c1">; ModuleID = 'myadd__kernel0'</span>
-<span class="err">source_filename</span> <span class="p">=</span> <span class="s">"myadd__kernel0"</span>
+<span class="err">sour</span><span class="k">c</span><span class="err">e_filename</span> <span class="p">=</span> <span class="s">"myadd__kernel0"</span>
 <span class="k">target</span> <span class="k">datalayout</span> <span class="p">=</span> <span class="s">"e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"</span>
 <span class="k">target</span> <span class="k">triple</span> <span class="p">=</span> <span class="s">"amdgcn-amd-amdhsa-hcc"</span>
 
 
 <span class="c1">; Function Attrs: nounwind</span>
-<span class="k">define</span> <span class="k">dllexport</span> <span class="err">amdgpu_kernel</span> <span class="kt">void</span> <span class="vg">@myadd__kernel0</span><span class="p">(</span><span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="k">noalias</span> <span class="k">nocapture</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class= [...]
+<span class="k">define</span> <span class="k">dllexport</span> <span class="err">amdgpu_ker</span><span class="k">ne</span><span class="err">l</span> <span class="kt">void</span> <span class="vg">@myadd__kernel0</span><span class="p">(</span><span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="k">noalias</span> <span clas [...]
 <span class="nl">entry:</span>
   <span class="nv">%4</span> <span class="p">=</span> <span class="k">tail</span> <span class="k">call</span> <span class="kt">i32</span> <span class="vg">@llvm.amdgcn.workgroup.id.x</span><span class="p">()</span>
   <span class="nv">%5</span> <span class="p">=</span> <span class="k">tail</span> <span class="k">call</span> <span class="kt">i32</span> <span class="vg">@llvm.amdgcn.workitem.id.x</span><span class="p">()</span>
@@ -288,14 +288,14 @@ We are starting to look at performance optimization and we expect more improveme
   <span class="nv">%10</span> <span class="p">=</span> <span class="k">add</span> <span class="k">nsw</span> <span class="kt">i32</span> <span class="nv">%.pre-phi</span><span class="p">,</span> <span class="nv">%5</span>
   <span class="nv">%11</span> <span class="p">=</span> <span class="k">add</span> <span class="k">nsw</span> <span class="kt">i32</span> <span class="nv">%.pre-phi</span><span class="p">,</span> <span class="nv">%5</span>
   <span class="nv">%12</span> <span class="p">=</span> <span class="k">sext</span> <span class="kt">i32</span> <span class="nv">%11</span> <span class="k">to</span> <span class="kt">i64</span>
-  <span class="nv">%13</span> <span class="p">=</span> <span class="k">getelementptr</span> <span class="k">inbounds</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%2</span><span class="p">,</span> <span class="kt">i64</span> <span class="nv">%12</span>
-  <span class="nv">%14</span> <span class="p">=</span> <span class="k">load</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%13</span><span class="p">,</span> <span class="k">align</span> <span class="m">4</span><span class="p">,</span> <span class="nv">!tbaa</span> <span class="nv">!2</span>
-  <span class="nv">%15</span> <span class="p">=</span> <span class="k">getelementptr</span> <span class="k">inbounds</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%1</span><span class="p">,</span> <span class="kt">i64</span> <span class="nv">%12</span>
-  <span class="nv">%16</span> <span class="p">=</span> <span class="k">load</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%15</span><span class="p">,</span> <span class="k">align</span> <span class="m">4</span><span class="p">,</span> <span class="nv">!tbaa</span> <span class="nv">!6</span>
+  <span class="nv">%13</span> <span class="p">=</span> <span class="k">getelementptr</span> <span class="k">inbounds</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%2</span><span class="p">,</span> <span class="kt">i64</span> <span class="nv">%12</span>
+  <span class="nv">%14</span> <span class="p">=</span> <span class="k">load</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%13</span><span class="p">,</span> <span class="k">align</span> <span class="m">4</span><span class="p">,</span> <span class="nv" [...]
+  <span class="nv">%15</span> <span class="p">=</span> <span class="k">getelementptr</span> <span class="k">inbounds</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%1</span><span class="p">,</span> <span class="kt">i64</span> <span class="nv">%12</span>
+  <span class="nv">%16</span> <span class="p">=</span> <span class="k">load</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%15</span><span class="p">,</span> <span class="k">align</span> <span class="m">4</span><span class="p">,</span> <span class="nv" [...]
   <span class="nv">%17</span> <span class="p">=</span> <span class="k">fadd</span> <span class="kt">float</span> <span class="nv">%14</span><span class="p">,</span> <span class="nv">%16</span>
   <span class="nv">%18</span> <span class="p">=</span> <span class="k">sext</span> <span class="kt">i32</span> <span class="nv">%10</span> <span class="k">to</span> <span class="kt">i64</span>
-  <span class="nv">%19</span> <span class="p">=</span> <span class="k">getelementptr</span> <span class="k">inbounds</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%0</span><span class="p">,</span> <span class="kt">i64</span> <span class="nv">%18</span>
-  <span class="k">store</span> <span class="kt">float</span> <span class="nv">%17</span><span class="p">,</span> <span class="kt">float</span> <span class="k">addrspace</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%19</span><span class="p">,</span> <span class="k">align</span> <span class="m">4</span><span class="p">,</span> <span class="nv">!tbaa</span> <span class="nv">!9</span>
+  <span class="nv">%19</span> <span class="p">=</span> <span class="k">getelementptr</span> <span class="k">inbounds</span> <span class="kt">float</span><span class="p">,</span> <span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%0</span><span class="p">,</span> <span class="kt">i64</span> <span class="nv">%18</span>
+  <span class="k">store</span> <span class="kt">float</span> <span class="nv">%17</span><span class="p">,</span> <span class="kt">float</span> <span class="k">add</span><span class="err">rspa</span><span class="k">c</span><span class="err">e</span><span class="p">(</span><span class="m">1</span><span class="p">)*</span> <span class="nv">%19</span><span class="p">,</span> <span class="k">align</span> <span class="m">4</span><span class="p">,</span> <span class="nv">!tbaa</span> <span clas [...]
   <span class="k">br</span> <span class="kt">label</span> <span class="nv">%if_end</span>
 
 
diff --git a/2020/07/14/bert-pytorch-tvm.html b/2020/07/14/bert-pytorch-tvm.html
index be00c9e..66f37bd 100644
--- a/2020/07/14/bert-pytorch-tvm.html
+++ b/2020/07/14/bert-pytorch-tvm.html
@@ -260,12 +260,12 @@ Now it’s in the region of 6.5-7ms per run, similar to PyTorch. This is what we
 <p>Like many deep learning models, BERT comes with a bit of prologue (vocabulary embeddings) and epilogue (pooling) and the bulk is organized into similar-looking blocks, here we have 12 <code class="highlighter-rouge">BertLayer</code> modules.
 The <code class="highlighter-rouge">attention_mask</code> is just to prevent BERT from looking at the answer when dealing with the question.</p>
 
-<p><img src="/images/bert-pytorch/bert_model.svg" alt="Bert Model" /></p>
+<p><img src="/images/bert-pytorch/bert_model.svg" alt="Bert Model" width="100%" /></p>
 
 <p>So let us zoom in and look at a BertLayer in detail, since that ultimately is what we need to make fast.
 As we see in the net diagram, the main part of the <code class="highlighter-rouge">BertLayer</code> module is a submodule <code class="highlighter-rouge">BertSelfAttention</code>.</p>
 
-<p><img src="/images/bert-pytorch/bert_layer.svg" alt="BertLayer" /></p>
+<p><img src="/images/bert-pytorch/bert_layer.svg" alt="BertLayer" width="100%" /></p>
 
 <p>Now the <code class="highlighter-rouge">BertSelfAttention</code> captures the famed self-attention mechanism that is the hallmark of transformer models. (I cannot recommend Sascha Rush’s <a href="http://nlp.seas.harvard.edu/2018/04/03/attention.html">Annotated Transformer</a> enough as a detailed walkthrough.)</p>
 
@@ -312,7 +312,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
         <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">runtime</span><span class="o">.</span><span class="n">ndarray</span><span class="o">.</span><span class="n">NDArray</span><span class="p">):</span>
             <span class="k">return</span> <span class="n">numpy</span><span class="o">.</span><span class="n">prod</span><span class="p">(</span><span class="n">c</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span> <span class="o">&lt;</span> <span class="mi">10</span>
         <span class="k">return</span> <span class="bp">True</span>
-            
+
     <span class="c1"># Sort by node ID
 </span>    <span class="k">for</span> <span class="n">node</span><span class="p">,</span> <span class="n">node_id</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">node_dict</span><span class="o">.</span><span class="n">items</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><s [...]
         <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Function</span><span class="p">):</span>
@@ -339,7 +339,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
             <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">node</span><span class="o">.</span><span class="n">fields</span><span class="p">:</span>
                 <span class="n">dot</span><span class="o">.</span><span class="n">edge</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_dict</span><span class="p">[</span><span class="n">field</span><span class="p">]),</span> <span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">))</span>
         <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="n">tvm</span><span class="o">.</span><span class="n">relay</span><span class="o">.</span><span class="n">Constant</span><span class="p">):</span>
-            
+
             <span class="k">if</span> <span class="ow">not</span> <span class="n">is_small_const</span><span class="p">(</span><span class="n">node</span><span class="p">):</span> <span class="c1"># small consts are shown in ops
 </span>                <span class="n">dot</span><span class="o">.</span><span class="n">node</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">node_id</span><span class="p">),</span> <span class="s">'Constant({}, {})'</span><span class="o">.</span><span class="nb">format</span><span class="p">(</span><span class="n">node</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class= [...]
                         <span class="o">**</span><span class="n">node_attr_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">node</span><span class="p">,</span> <span class="p">{}))</span>
@@ -396,7 +396,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
 <div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">visualize</span><span class="p">(</span><span class="n">mod</span><span class="p">[</span><span class="s">'main'</span><span class="p">])</span>
 </code></pre></div></div>
 
-<p><img src="/images/bert-pytorch/bert-tvm_49_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/bert-tvm_49_0.svg" alt="svg" width="100%" /></p>
 
 <p>In addition to our named inputs, we see a number of unnamed (numbered) variables. These are the neural network parameters.</p>
 
@@ -410,13 +410,13 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
 
 <p>One thing we see from the picture is that the input is reshaped three times. There is a TVM optimization pass called Common Subexpression Elimination (CSE) that combines the three reshapes.
 (A while ago, this did not succeed because it had distinct shape arguments, but this was since solved by the TVM developers in the dynamic to static conversion pass.)
-Also, the model parameters that are reshaped and transposed. Can we get rid of that, too? 
-Yes. And for that we would first <em>bind</em> the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes. 
+Also, the model parameters that are reshaped and transposed. Can we get rid of that, too?
+Yes. And for that we would first <em>bind</em> the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes.
 With the <code class="highlighter-rouge">Foldconstant</code> pass, we can propagate the constants through the <code class="highlighter-rouge">transpose</code>s and <code class="highlighter-rouge">reshape</code>s to move them closer to the matmuls.</p>
 
 <p>After these three (which TVM will do when we compile a relay model), our model looks like this:</p>
 
-<p><img src="/images/bert-pytorch/bert-tvm_72_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/bert-tvm_72_0.svg" alt="svg" width="100%" /></p>
 
 <p>And now comes an interesting trick. It is more efficient to merge the three batch matmuls with the same input into a single <code class="highlighter-rouge">batch_matmul</code>. We implemented a pass doing this in <a href="https://github.com/apache/incubator-tvm/pull/5791">TVM PR 5791</a>. So let’s call it and also have another constant-folding pass.</p>
 
@@ -425,7 +425,7 @@ With the <code class="highlighter-rouge">Foldconstant</code> pass, we can propag
 <span class="n">visualize</span><span class="p">(</span><span class="n">new_mod</span><span class="p">[</span><span class="s">"main"</span><span class="p">])</span>
 </code></pre></div></div>
 
-<p><img src="/images/bert-pytorch/bert-tvm_74_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/bert-tvm_74_0.svg" alt="svg" width="100%" /></p>
 
 <p>Awesome. After checking that we still get the same result.
 We can time again: 25.2 ms for 100 runs. It’s a bit slow again because we need to tune for the new shapes.
@@ -489,7 +489,7 @@ Again, we get our relay model with running a traced <code class="highlighter-rou
 <p>One thing we’ll do in between is to move from a modular interface in PyTorch - with named parameters - to a functional
 interface (which is what TVM can do for us). The first thing we want to do for that is arrange for the function arguments to be in an order that we can work with - i.e. first the direct inputs to the module and then the parameters in the same order that PyTorch uses them. After this operation, our <code class="highlighter-rouge">BertLayer </code> in TVM looks like this:</p>
 
-<p><img src="/images/bert-pytorch/pytorch-tvm-training_20_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_20_0.svg" alt="svg" width="100%" /></p>
 
 <p>As in the BERT inference, we want to run some optimization passes.</p>
 
@@ -506,7 +506,7 @@ interface (which is what TVM can do for us). The first thing we want to do for t
 
 <p>With these modifications applied, our model looks like this:</p>
 
-<p><img src="/images/bert-pytorch/pytorch-tvm-training_25_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_25_0.svg" alt="svg" width="100%" /></p>
 
 <p>Finally we can take the grad. As we get a lot of <code class="highlighter-rouge">let</code> nodes, we bring it to normal form using the <code class="highlighter-rouge">ToGraphNormalForm</code> pass.
 TVM’s gradient-taking returns a function that has the same parameters as the original function (in our case amended with the <code class="highlighter-rouge">grad_out</code> and dropout) and then returns a tuple of the original return and a tuple containing gradients for all inputs.
@@ -515,9 +515,9 @@ Then we run our simplification passes.</p>
 
 <p>So this is the graph we have now for forward and backward:</p>
 
-<p><img src="/images/bert-pytorch/pytorch-tvm-training_31_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_31_0.svg" alt="svg" width="100%" /></p>
 
-<p>But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and 
+<p>But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and
 split our graph. One of the difficult problems is what to do with things computed for both forward and backward. It is a hard problem, related to the MinCut problem.</p>
 
 <p>Our extremal options could be:</p>
@@ -532,7 +532,7 @@ split our graph. One of the difficult problems is what to do with things compute
 
 <p>A bit of (PyTorch) terminology: When we have a function <em>Layer : x ↦ y</em> followed by some <em>Loss: y ↦ l ∈ ℝ</em>, the backward is <em>BackwardOfLayer : grad<code class="highlighter-rouge">_</code>out ↦ grad<code class="highlighter-rouge">_</code>in</em> with <em>grad<code class="highlighter-rouge">_</code>out = dl/dy</em> and <em>grad<code class="highlighter-rouge">_</code>in = dl/dx</em>.</p>
 
-<p><img src="/images/bert-pytorch/pytorch-tvm-training_34_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_34_0.svg" alt="svg" width="100%" /></p>
 
 <p>In order to split the function as described above, we collect the blue nodes as to capture - but constants will
 just be duplicated and inputs (<code class="highlighter-rouge">Var</code> nodes) need to be treated separately.
@@ -540,7 +540,7 @@ Now we can split out the backward, replacing all the blue nodes with variables.<
 
 <p>Next we take the forward and amend it to also return the required intermediates. The forward then looks like this:</p>
 
-<p><img src="/images/bert-pytorch/pytorch-tvm-training_40_0.svg" alt="svg" /></p>
+<p><img src="/images/bert-pytorch/pytorch-tvm-training_40_0.svg" alt="svg" width="100%" /></p>
 
 <p>TVM cannot return nested tuples, so we flatten the output in the function. Again we differentiate between tensor-valued functions and tuple valued ones (i.e. those returning potentially multiple tensors).</p>
 
@@ -564,7 +564,7 @@ Time to give it a spin. We define convenience functions to move tensors between
 <span class="n">drop_c</span> <span class="o">=</span> <span class="p">{}</span>
 <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">dropout_info</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span> <span class="c1"># we don't know the order
 </span>    <span class="n">p</span><span class="p">,</span> <span class="n">typ</span> <span class="o">=</span> <span class="n">dropout_info</span><span class="p">[</span><span class="n">k</span><span class="p">]</span>
-    <span class="n">drop_c</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">dropout</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">([</span><span class="nb">int</span><span class="p">(< [...]
+    <span class="n">drop_c</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">dropout</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">([</span><span class="nb">int</span><span class="p">(< [...]
                                               <span class="n">dtype</span><span class="o">=</span><span class="nb">getattr</span><span class="p">(</span><span class="n">torch</span><span class="p">,</span> <span class="n">typ</span><span class="o">.</span><span class="n">dtype</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="s">"cuda"</span><span class="p">),</span> <span class="n">p</span><span class="o">=</span><span class="n">p</span><s [...]
 
 <span class="n">drop_tvm</span> <span class="o">=</span> <span class="p">{</span><span class="n">n</span><span class="p">:</span> <span class="n">tensor_to_tvm</span><span class="p">(</span><span class="n">t</span><span class="p">)</span> <span class="k">for</span> <span class="n">n</span><span class="p">,</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">drop_c</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
diff --git a/atom.xml b/atom.xml
index b91c889..e105120 100644
--- a/atom.xml
+++ b/atom.xml
@@ -4,7 +4,7 @@
  <title>TVM</title>
  <link href="https://tvm.apache.org" rel="self"/>
  <link href="https://tvm.apache.org"/>
- <updated>2020-07-14T09:04:35-07:00</updated>
+ <updated>2020-07-14T09:12:02-07:00</updated>
  <id>https://tvm.apache.org</id>
  <author>
    <name></name>
@@ -115,12 +115,12 @@ Now it’s in the region of 6.5-7ms per run, similar to PyTorch. This is what we
 &lt;p&gt;Like many deep learning models, BERT comes with a bit of prologue (vocabulary embeddings) and epilogue (pooling) and the bulk is organized into similar-looking blocks, here we have 12 &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; modules.
 The &lt;code class=&quot;highlighter-rouge&quot;&gt;attention_mask&lt;/code&gt; is just to prevent BERT from looking at the answer when dealing with the question.&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_model.svg&quot; alt=&quot;Bert Model&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_model.svg&quot; alt=&quot;Bert Model&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;So let us zoom in and look at a BertLayer in detail, since that ultimately is what we need to make fast.
 As we see in the net diagram, the main part of the &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; module is a submodule &lt;code class=&quot;highlighter-rouge&quot;&gt;BertSelfAttention&lt;/code&gt;.&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_layer.svg&quot; alt=&quot;BertLayer&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_layer.svg&quot; alt=&quot;BertLayer&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;Now the &lt;code class=&quot;highlighter-rouge&quot;&gt;BertSelfAttention&lt;/code&gt; captures the famed self-attention mechanism that is the hallmark of transformer models. (I cannot recommend Sascha Rush’s &lt;a href=&quot;http://nlp.seas.harvard.edu/2018/04/03/attention.html&quot;&gt;Annotated Transformer&lt;/a&gt; enough as a detailed walkthrough.)&lt;/p&gt;
 
@@ -167,7 +167,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
         &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;runtime&lt;/span&gt; [...]
             &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;prod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; [...]
         &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;
-            
+
     &lt;span class=&quot;c1&quot;&gt;# Sort by node ID
 &lt;/span&gt;    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;sorted&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&g [...]
         &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Function&lt;/spa [...]
@@ -194,7 +194,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
             &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;field&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fields&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
                 &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;field&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/sp [...]
         &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Constant&lt;/s [...]
-            
+
             &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;not&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;is_small_const&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# small consts are shown in ops
 &lt;/span&gt;                &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Constant({}, {})'&lt;/span&gt;&lt;span class=& [...]
                         &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_attr_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}))&lt;/span&gt;
@@ -251,7 +251,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'main'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_49_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_49_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;In addition to our named inputs, we see a number of unnamed (numbered) variables. These are the neural network parameters.&lt;/p&gt;
 
@@ -265,13 +265,13 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
 
 &lt;p&gt;One thing we see from the picture is that the input is reshaped three times. There is a TVM optimization pass call Common Subexpression Elimination (CSE) that combines the three reshapes.
 (A while ago, this did not succeed because it had distinct shape arguments, but this was since solved by the TVM developers in the dynamic to static conversion pass.)
-Also, the model parameters that are reshaped and transposed. Can we get rid of that, too? 
-Yes. And for that we would first &lt;em&gt;bind&lt;/em&gt; the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes. 
+Also, the model parameters that are reshaped and transposed. Can we get rid of that, too?
+Yes. And for that we would first &lt;em&gt;bind&lt;/em&gt; the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes.
 With the &lt;code class=&quot;highlighter-rouge&quot;&gt;Foldconstant&lt;/code&gt; pass, we can propagate the constants through the &lt;code class=&quot;highlighter-rouge&quot;&gt;transpose&lt;/code&gt;s and &lt;code class=&quot;highlighter-rouge&quot;&gt;reshape&lt;/code&gt;s to move them closer to the matmuls.&lt;/p&gt;
 
 &lt;p&gt;After these three (which TVM will do when we compile a relay model), our model looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_72_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_72_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;And now comes an interesting trick. It is more efficient to merge the three batch matmuls with the same input into a single &lt;code class=&quot;highlighter-rouge&quot;&gt;batch_matmul&lt;/code&gt;. We implemented a pass doing this in &lt;a href=&quot;https://github.com/apache/incubator-tvm/pull/5791&quot;&gt;TVM PR 5791&lt;/a&gt;. So let’s call it and also have another constant-folding pass.&lt;/p&gt;
 
@@ -280,7 +280,7 @@ With the &lt;code class=&quot;highlighter-rouge&quot;&gt;Foldconstant&lt;/code&g
 &lt;span class=&quot;n&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;new_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;main&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_74_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_74_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;Awesome. After checking that we still get the same result.
 We can time again: 25.2 ms for 100 runs. It’s a bit slow again because we need to tune for the new shapes.
@@ -344,7 +344,7 @@ Again, we get our relay model with running a traced &lt;code class=&quot;highlig
 &lt;p&gt;One thing we’ll do in between is to move from a modular interface in PyTorch - with named parameters - to a functional
 interface (which is what TVM can do for us). The first thing we want to do for that is arrange for the function arguments to be in an order that we can work with - i.e. first the direct inputs to the module and then the parameters in the same order that PyTorch uses them. After this operation, our &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer &lt;/code&gt; in TVM looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_20_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_20_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;As in the BERT inference, we want to run some optimization passes.&lt;/p&gt;
 
@@ -361,7 +361,7 @@ interface (which is what TVM can do for us). The first thing we want to do for t
 
 &lt;p&gt;With these modificaitons applied, our model looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_25_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_25_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;Finally we can take the grad. As we get a lot of &lt;code class=&quot;highlighter-rouge&quot;&gt;let&lt;/code&gt; nodes, we bring it to normal form using the &lt;code class=&quot;highlighter-rouge&quot;&gt;ToGraphNormalForm&lt;/code&gt; pass.
 TVM’s gradient-taking returns a function that has the same parameters as the original function (in our case amended with the &lt;code class=&quot;highlighter-rouge&quot;&gt;grad_out&lt;/code&gt; and dropout) and then returns a tuple of the original return and a tuple containing gradients for all inputs.
@@ -370,9 +370,9 @@ Then we run our simplification passes.&lt;/p&gt;
 
 &lt;p&gt;So this is the graph we have now for forward and backward:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_31_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_31_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
-&lt;p&gt;But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and 
+&lt;p&gt;But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and
 split our graph. One of the difficult problems is what to do with things computed for both forward and backward. It is a hard problem, related to the MinCut problem.&lt;/p&gt;
 
 &lt;p&gt;Our extremal options could be:&lt;/p&gt;
@@ -387,7 +387,7 @@ split our graph. One of the difficult problems is what to do with things compute
 
 &lt;p&gt;A bit of (PyTorch) terminology: When we have a function &lt;em&gt;Layer : x ↦ y&lt;/em&gt; followed by some &lt;em&gt;Loss: y ↦ l ∈ ℝ&lt;/em&gt;, the backward is &lt;em&gt;BackwardOfLayer : grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;out ↦ grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;in&lt;/em&gt; with &lt;em&gt;grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;out = dl/dy&lt;/em&gt; and *grad&lt;code class=&quot;highlig [...]
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_34_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_34_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;In order to split the function as described above, we collect the blue nodes as to capture - but constants will
 just be duplicated and inputs (&lt;code class=&quot;highlighter-rouge&quot;&gt;Var&lt;/code&gt; nodes) need to be treated separately.
@@ -395,7 +395,7 @@ Now we can split out the backward, replacing all the blue nodes with variables.&
 
 &lt;p&gt;Next we take the forward and amend it to also return the required intermediates. The forward then looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_40_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_40_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;TVM cannot return nested tuples, so we flatten the output in the function. Again we differentiate between tensor-valued functions and tuple valued ones (i.e. those returning potentially multiple tensors).&lt;/p&gt;
 
@@ -419,7 +419,7 @@ Time to give it a spin. We define convenience functions to move tensors between
 &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dropout_info&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;keys&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;():&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# we don't know the order
 &lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;p&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typ&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dropout_info&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;functional&lt;/span&gt;&lt;spa [...]
+    &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;functional&lt;/span&gt;&lt;spa [...]
                                               &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;getattr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typ&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span c [...]
 
 &lt;span class=&quot;n&quot;&gt;drop_tvm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_to_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;sp [...]
@@ -3901,13 +3901,13 @@ We are starting to look at performance optimization and we expect more improveme
 &lt;p&gt;You should see something like this:&lt;/p&gt;
 
 &lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-llvm&quot; data-lang=&quot;llvm&quot;&gt;&lt;span class=&quot;c1&quot;&gt;; ModuleID = 'myadd__kernel0'&lt;/span&gt;
-&lt;span class=&quot;err&quot;&gt;source_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
+&lt;span class=&quot;err&quot;&gt;sour&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;datalayout&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64&quot;&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;triple&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;amdgcn-amd-amdhsa-hcc&quot;&lt;/span&gt;
 
 
 &lt;span class=&quot;c1&quot;&gt;; Function Attrs: nounwind&lt;/span&gt;
-&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_kernel&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class [...]
+&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_ker&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;ne&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;l&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k [...]
 &lt;span class=&quot;nl&quot;&gt;entry:&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workgroup.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workitem.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
@@ -3927,14 +3927,14 @@ We are starting to look at performance optimization and we expect more improveme
   &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%12&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
-  &lt;span class=&quot;nv&quot;&gt;%13&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&g [...]
-  &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)*&lt;/span&gt; [...]
-  &lt;span class=&quot;nv&quot;&gt;%15&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&g [...]
-  &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)*&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%13&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%15&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
   &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;fadd&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%18&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
-  &lt;span class=&quot;nv&quot;&gt;%19&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&g [...]
-  &lt;span class=&quot;k&quot;&gt;store&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)*&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%19&lt;/span [...]
+  &lt;span class=&quot;nv&quot;&gt;%19&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;k&quot;&gt;store&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; [...]
   &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end&lt;/span&gt;
 
 
diff --git a/rss.xml b/rss.xml
index fb7b368..c9886c9 100644
--- a/rss.xml
+++ b/rss.xml
@@ -5,8 +5,8 @@
         <description>TVM - </description>
         <link>https://tvm.apache.org</link>
         <atom:link href="https://tvm.apache.org" rel="self" type="application/rss+xml" />
-        <lastBuildDate>Tue, 14 Jul 2020 09:04:35 -0700</lastBuildDate>
-        <pubDate>Tue, 14 Jul 2020 09:04:35 -0700</pubDate>
+        <lastBuildDate>Tue, 14 Jul 2020 09:12:02 -0700</lastBuildDate>
+        <pubDate>Tue, 14 Jul 2020 09:12:02 -0700</pubDate>
         <ttl>60</ttl>
 
 
@@ -110,12 +110,12 @@ Now it’s in the region of 6.5-7ms per run, similar to PyTorch. This is what we
 &lt;p&gt;Like many deep learning models, BERT comes with a bit some prologue (vocabulary embeddings) and epilogue (pooling) and the bulk is organized into similar-looking blocks, here we have 12 &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; modules.
 The &lt;code class=&quot;highlighter-rouge&quot;&gt;attention_mask&lt;/code&gt; is jsut to prevent BERT from looking at the answer when dealing with the question.&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_model.svg&quot; alt=&quot;Bert Model&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_model.svg&quot; alt=&quot;Bert Model&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;So let us zoom in and look at a BertLayer in detail, since that ultimately is what we need make fast.
 As we see in the net diagram, the main part of the &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer&lt;/code&gt; module is a submodule &lt;code class=&quot;highlighter-rouge&quot;&gt;BertSelfAttention&lt;/code&gt;.&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_layer.svg&quot; alt=&quot;BertLayer&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert_layer.svg&quot; alt=&quot;BertLayer&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;Now the &lt;code class=&quot;highlighter-rouge&quot;&gt;BertSelfAttention&lt;/code&gt; captures the famed self-attention mechanism that is the hallmark of transformer models. (I cannot recommend Sascha Rush’s &lt;a href=&quot;http://nlp.seas.harvard.edu/2018/04/03/attention.html&quot;&gt;Annotated Transformer&lt;/a&gt; enough as a detailed walkthrough.)&lt;/p&gt;
 
@@ -162,7 +162,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
         &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;runtime&lt;/span&gt; [...]
             &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numpy&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;prod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt; [...]
         &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;True&lt;/span&gt;
-            
+
     &lt;span class=&quot;c1&quot;&gt;# Sort by node ID
 &lt;/span&gt;    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;sorted&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&g [...]
         &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Function&lt;/spa [...]
@@ -189,7 +189,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
             &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;field&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fields&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
                 &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;edge&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_dict&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;field&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]),&lt;/sp [...]
         &lt;span class=&quot;k&quot;&gt;elif&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;isinstance&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tvm&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;relay&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Constant&lt;/s [...]
-            
+
             &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;not&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;is_small_const&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# small consts are shown in ops
 &lt;/span&gt;                &lt;span class=&quot;n&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;str&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_id&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;'Constant({}, {})'&lt;/span&gt;&lt;span class=& [...]
                         &lt;span class=&quot;o&quot;&gt;**&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node_attr_dict&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;get&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;node&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}))&lt;/span&gt;
@@ -246,7 +246,7 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
 &lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;'main'&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_49_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_49_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;In addition to our named inputs, we see a number of unnamed (numbered) variables. These are the neural network parameters.&lt;/p&gt;
 
@@ -260,13 +260,13 @@ We grab the inputs of a BertLayer (see the Notebook for how) and convert a singl
 
 &lt;p&gt;One thing we see from the picture is that the input is reshaped three times. There is a TVM optimization pass called Common Subexpression Elimination (CSE) that combines the three reshapes.
 (A while ago, this did not succeed because it had distinct shape arguments, but this was since solved by the TVM developers in the dynamic to static conversion pass.)
-Also, the model parameters that are reshaped and transposed. Can we get rid of that, too? 
-Yes. And for that we would first &lt;em&gt;bind&lt;/em&gt; the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes. 
+Also, the model parameters that are reshaped and transposed. Can we get rid of that, too?
+Yes. And for that we would first &lt;em&gt;bind&lt;/em&gt; the parameters, i.e. put them into the model. Then the parameters have become constants instead of input nodes.
 With the &lt;code class=&quot;highlighter-rouge&quot;&gt;Foldconstant&lt;/code&gt; pass, we can propagate the constants through the &lt;code class=&quot;highlighter-rouge&quot;&gt;transpose&lt;/code&gt;s and &lt;code class=&quot;highlighter-rouge&quot;&gt;reshape&lt;/code&gt;s to move them closer to the matmuls.&lt;/p&gt;
 
 &lt;p&gt;After these three (which TVM will do when we compile a relay model), our model looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_72_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_72_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;And now comes an interesting trick. It is more efficient to merge the three batch matmuls with the same input into a single &lt;code class=&quot;highlighter-rouge&quot;&gt;batch_matmul&lt;/code&gt;. We implemented a pass doing this in &lt;a href=&quot;https://github.com/apache/incubator-tvm/pull/5791&quot;&gt;TVM PR 5791&lt;/a&gt;. So let’s call it and also have another constant-folding pass.&lt;/p&gt;
 
@@ -275,7 +275,7 @@ With the &lt;code class=&quot;highlighter-rouge&quot;&gt;Foldconstant&lt;/code&g
 &lt;span class=&quot;n&quot;&gt;visualize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;new_mod&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;main&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])&lt;/span&gt;
 &lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_74_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/bert-tvm_74_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;Awesome. After checking that we still get the same result,
 we can time again: 25.2 ms for 100 runs. It’s a bit slow again because we need to tune for the new shapes.
@@ -339,7 +339,7 @@ Again, we get our relay model with running a traced &lt;code class=&quot;highlig
 &lt;p&gt;One thing we’ll do in between is to move from a modular interface in PyTorch - with named parameters - to a functional
 interface (which is what TVM can do for us). The first thing we want to do for that is arrange for the function arguments to be in an order that we can work with - i.e. first the direct inputs to the module and then the parameters in the same order that PyTorch uses them. After this operation, our &lt;code class=&quot;highlighter-rouge&quot;&gt;BertLayer &lt;/code&gt; in TVM looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_20_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_20_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;As in the BERT inference, we want to run some optimization passes.&lt;/p&gt;
 
@@ -356,7 +356,7 @@ interface (which is what TVM can do for us). The first thing we want to do for t
 
 &lt;p&gt;With these modifications applied, our model looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_25_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_25_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;Finally we can take the grad. As we get a lot of &lt;code class=&quot;highlighter-rouge&quot;&gt;let&lt;/code&gt; nodes, we bring it to normal form using the &lt;code class=&quot;highlighter-rouge&quot;&gt;ToGraphNormalForm&lt;/code&gt; pass.
 TVM’s gradient-taking returns a function that has the same parameters as the original function (in our case amended with the &lt;code class=&quot;highlighter-rouge&quot;&gt;grad_out&lt;/code&gt; and dropout) and then returns a tuple of the original return and a tuple containing gradients for all inputs.
@@ -365,9 +365,9 @@ Then we run our simplification passes.&lt;/p&gt;
 
 &lt;p&gt;So this is the graph we have now for forward and backward:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_31_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_31_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
-&lt;p&gt;But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and 
+&lt;p&gt;But in PyTorch, we first compute the forward and then the backwards, so we have to take out the saw and
 split our graph. One of the difficult problems is what to do with things computed for both forward and backward. It is a hard problem, related to the MinCut problem.&lt;/p&gt;
 
 &lt;p&gt;Our extremal options could be:&lt;/p&gt;
@@ -382,7 +382,7 @@ split our graph. One of the difficult problems is what to do with things compute
 
 &lt;p&gt;A bit of (PyTorch) terminology: When we have a function &lt;em&gt;Layer : x ↦ y&lt;/em&gt; followed by some &lt;em&gt;Loss: y ↦ l ∈ ℝ&lt;/em&gt;, the backward is &lt;em&gt;BackwardOfLayer : grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;out ↦ grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;in&lt;/em&gt; with &lt;em&gt;grad&lt;code class=&quot;highlighter-rouge&quot;&gt;_&lt;/code&gt;out = dl/dy&lt;/em&gt; and *grad&lt;code class=&quot;highlig [...]
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_34_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_34_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;In order to split the function as described above, we collect the blue nodes as to capture - but constants will
 just be duplicated and inputs (&lt;code class=&quot;highlighter-rouge&quot;&gt;Var&lt;/code&gt; nodes) need to be treated separately.
@@ -390,7 +390,7 @@ Now we can split out the backward, replacing all the blue nodes with variables.&
 
 &lt;p&gt;Next we take the forward and amend it to also return the required intermediates. The forward then looks like this:&lt;/p&gt;
 
-&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_40_0.svg&quot; alt=&quot;svg&quot; /&gt;&lt;/p&gt;
+&lt;p&gt;&lt;img src=&quot;/images/bert-pytorch/pytorch-tvm-training_40_0.svg&quot; alt=&quot;svg&quot; width=&quot;100%&quot; /&gt;&lt;/p&gt;
 
 &lt;p&gt;TVM cannot return nested tuples, so we flatten the output in the function. Again we differentiate between tensor-valued functions and tuple-valued ones (i.e. those returning potentially multiple tensors).&lt;/p&gt;
 
@@ -414,7 +414,7 @@ Time to give it a spin. We define convenience functions to move tensors between
 &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{}&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;k&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dropout_info&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;keys&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;():&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;# we don't know the order
 &lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;p&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typ&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dropout_info&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
-    &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;functional&lt;/span&gt;&lt;spa [...]
+    &lt;span class=&quot;n&quot;&gt;drop_c&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;k&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nn&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;functional&lt;/span&gt;&lt;spa [...]
                                               &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;getattr&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;torch&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;typ&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span c [...]
 
 &lt;span class=&quot;n&quot;&gt;drop_tvm&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;n&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tensor_to_tvm&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;sp [...]
@@ -3896,13 +3896,13 @@ We are starting to look at performance optimization and we expect more improveme
 &lt;p&gt;You should see something like this:&lt;/p&gt;
 
 &lt;figure class=&quot;highlight&quot;&gt;&lt;pre&gt;&lt;code class=&quot;language-llvm&quot; data-lang=&quot;llvm&quot;&gt;&lt;span class=&quot;c1&quot;&gt;; ModuleID = 'myadd__kernel0'&lt;/span&gt;
-&lt;span class=&quot;err&quot;&gt;source_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
+&lt;span class=&quot;err&quot;&gt;sour&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e_filename&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;myadd__kernel0&quot;&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;datalayout&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64&quot;&lt;/span&gt;
 &lt;span class=&quot;k&quot;&gt;target&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;triple&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;amdgcn-amd-amdhsa-hcc&quot;&lt;/span&gt;
 
 
 &lt;span class=&quot;c1&quot;&gt;; Function Attrs: nounwind&lt;/span&gt;
-&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_kernel&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class [...]
+&lt;span class=&quot;k&quot;&gt;define&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;dllexport&lt;/span&gt; &lt;span class=&quot;err&quot;&gt;amdgpu_ker&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;ne&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;l&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@myadd__kernel0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k [...]
 &lt;span class=&quot;nl&quot;&gt;entry:&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%4&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workgroup.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;tail&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;call&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;vg&quot;&gt;@llvm.amdgcn.workitem.id.x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
@@ -3922,14 +3922,14 @@ We are starting to look at performance optimization and we expect more improveme
   &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;nsw&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%.pre-phi&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%5&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%12&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%11&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
-  &lt;span class=&quot;nv&quot;&gt;%13&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&g [...]
-  &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)*&lt;/span&gt; [...]
-  &lt;span class=&quot;nv&quot;&gt;%15&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&g [...]
-  &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)*&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%13&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
+  &lt;span class=&quot;nv&quot;&gt;%15&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;load&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt; [...]
   &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;fadd&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%14&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%16&lt;/span&gt;
   &lt;span class=&quot;nv&quot;&gt;%18&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;sext&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i32&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%10&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;to&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;i64&lt;/span&gt;
-  &lt;span class=&quot;nv&quot;&gt;%19&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&g [...]
-  &lt;span class=&quot;k&quot;&gt;store&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;addrspace&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;m&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)*&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%19&lt;/span [...]
+  &lt;span class=&quot;nv&quot;&gt;%19&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;getelementptr&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;inbounds&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt [...]
+  &lt;span class=&quot;k&quot;&gt;store&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%17&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;add&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;rspa&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;c&lt;/span&gt;&lt;span class=&quot;err&quot;&gt;e&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; [...]
   &lt;span class=&quot;k&quot;&gt;br&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;label&lt;/span&gt; &lt;span class=&quot;nv&quot;&gt;%if_end&lt;/span&gt;