You are viewing a plain text version of this content. The canonical link for it is here.
Posted to discuss-archive@tvm.apache.org by Josse Van Delm via Apache TVM Discuss <no...@discuss.tvm.ai> on 2021/03/08 11:39:53 UTC

[Apache TVM Discuss] [Questions] [TE] Tensorize Elementwise Sum


Hi everyone,

I'm currently trying to tensorize the schedule for a very simple  [4,4] matrix element-wise sum (add) to be performed in 4 [2,2] matrix addition steps by an intrinsic function. I've looked into adapting the tutorial on [Tensorization](https://tvm.apache.org/docs/tutorials/language/tensorize.html) but I cannot get a schedule that compiles as I think I'm not correctly creating buffers (the code gets stuck in the StorageFlattener step). Currently I came up with this schedule, but it doesn't compile (python source code below).:
```
primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [4, 4], []),
             B: Buffer(B_2: Pointer(float32), float32, [4, 4], []),
             A: Buffer(A_2: Pointer(float32), float32, [4, 4], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  attr [C] "realize_scope" = "";
  realize(C, [0:4, 0:4], True {
    for (i.outer: int32, 0, 2) {
      for (j.outer: int32, 0, 2) {
        attr [[A_3: Buffer(A_4: Pointer(float32), float32, [2, 2], [2, 1], elem_offset=A_elem_offset: int32), A]] "buffer_bind_scope" = @tir.tvm_tuple((i.outer*2), 2, (j.outer*2), 2, dtype=handle);
        attr [[B_3: Buffer(B_4: Pointer(float32), float32, [2, 2], [2, 1], elem_offset=B_elem_offset: int32), B]] "buffer_bind_scope" = @tir.tvm_tuple((i.outer*2), 2, (j.outer*2), 2, dtype=handle);
        attr [[C_3: Buffer(C_4: Pointer(float32), float32, [2, 2], [2, 1], elem_offset=C_elem_offset: int32), C]] "buffer_bind_scope" = @tir.tvm_tuple((i.outer*2), 2, (j.outer*2), 2, dtype=handle);
        @tir.call_extern("ews", 
        	@tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_4, C_elem_offset, 4, 2, dtype=handle), 
        	@tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_4, A_elem_offset, 4, 1, dtype=handle), 
        	@tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_4, B_elem_offset, 4, 1, dtype=handle), 
            2, 
            2, 
            2, 
            dtype=float32)
      }
    }
  })
}
```
Python source:
```
from __future__ import absolute_import, print_function

import tvm
from tvm import te


def intrin_ews(ro,co,data_type):
    a = te.placeholder((ro,co), dtype=data_type, name="a")
    b = te.placeholder((ro,co), dtype=data_type, name="b")
    c = te.compute((ro,co), lambda i,j: a[i,j] + b[i,j], name="c")

    # Preview a generic schedule
    #preview = te.create_schedule(c.op)
    #print(tvm.lower(preview, [a, b, c], simple_mode=True))

    # Define buffers
    # Offset factor --> optimize for vectorized buffering
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[2,1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[2,1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[2,1])

    def intrin_func(ins, outs):
        # create IR builder
        ib = tvm.tir.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        ib.emit(
            tvm.tir.call_extern(
                "float32",
                "ews",
                cc.access_ptr("w"),
                aa.access_ptr("r"),
                bb.access_ptr("r"),
                ro,
                co,
                bb.strides[0],
            )
        )
        return ib.get()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})


rows = 2
cols = 2
data_type = "float32"
# Create an instance
intrinsic = intrin_ews(rows,cols,data_type)

ro = 4
co = 4
# Create a tensorizable schedule
A = te.placeholder((ro,co), dtype=data_type, name="A")
B = te.placeholder((ro,co), dtype=data_type, name="B")
C = te.compute((ro,co), lambda i,j: A[i,j] + B[i,j], name="C")
# Create a vanilla schedule
s = te.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1],x_factor=2,y_factor=2)
print(tvm.lower(s, [A, B, C], simple_mode=True))
# Get a handle to the axis
# x, y = s[C].op.axis
# Tensorize!
s[C].tensorize(xi, intrinsic)
print(tvm.lower(s, [A, B, C], simple_mode=True))
```

The stack trace of the error:
```
tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (8) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::NodeFunctor<tvm::tir::Stmt (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const+0x11d) [0x7f9e3176df4d]
  [bt] (7) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#2}::_FUN(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)+0x26) [0x7f9e31766516]
  [bt] (6) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::StorageFlattener::VisitStmt_(tvm::tir::AttrStmtNode const*)+0x333) [0x7f9e31d09193]
  [bt] (5) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::StorageFlattener::HandleBufferBindScope(tvm::tir::AttrStmtNode const*)+0xbaf) [0x7f9e31d02f1f]
  [bt] (4) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::ArgBinder::BindBuffer(tvm::tir::Buffer const&, tvm::tir::Buffer const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)+0xb6a) [0x7f9e31c3d8aa]
  [bt] (3) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::ArgBinder::BindArray(tvm::runtime::Array<tvm::PrimExpr, void> const&, tvm::runtime::Array<tvm::PrimExpr, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x478) [0x7f9e31c3cb58]
  [bt] (2) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::ArgBinder::Bind_(tvm::PrimExpr const&, tvm::PrimExpr const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)+0x24c) [0x7f9e31c3c06c]
  [bt] (1) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::tir::BinderAddAssert(tvm::arith::Analyzer*, tvm::PrimExpr, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<tvm::tir::Stmt, std::allocator<tvm::tir::Stmt> >*)+0xe0) [0x7f9e31c3ba90]
  [bt] (0) /home/josse/.pyenv/versions/3.7.7/envs/tvm-sirius/lib/python3.7/site-packages/tvm-0.8.dev609+g57c467fed-py3.7-linux-x86_64.egg/tvm/libtvm.so(+0xad88a6) [0x7f9e31c3b8a6]
  File "/home/josse/Thesis/tvm-fork/tvm-fork/src/tir/transforms/arg_binder.cc", line 40
TVMError: Bind have an unmet assertion: (bool)0,  on argument A.strides[0]
```

I think I'm not correctly using the "buffer_bind_scope" instruction here.
I'm not surehow I should make it work, any solutions are comments are very much appreciated! 
If someone can explain me or give me some pointers on  how the buffer bind scope works that would be great!
Thanks!





---
[Visit Topic](https://discuss.tvm.apache.org/t/te-tensorize-elementwise-sum/9335/1) to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/a3d5e28716962a6e1d9e252d9cee8d1f9b7f0e670fb06a85ceb1752e956370b1).

[Apache TVM Discuss] [Questions] [TE] Tensorize Elementwise Sum

Posted by Josse Van Delm via Apache TVM Discuss <no...@discuss.tvm.ai>.

@leeexyz I can see it now too! This is really helpful! Thank you so much!





---
[Visit Topic](https://discuss.tvm.apache.org/t/te-tensorize-elementwise-sum/9335/3) to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/355ecef0adc65a1139e1c1535231e8cd55169b58660de6396abbb46ed194b8bb).

[Apache TVM Discuss] [Questions] [TE] Tensorize Elementwise Sum

Posted by leeexyz via Apache TVM Discuss <no...@discuss.tvm.ai>.

[quote="JosseVanDelm, post:1, topic:9335"]
```
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[2,1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[2,1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[2,1])
```
[/quote]

Hi, you specified the wrong strides. try:
```python
Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[4, 1])
Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[4, 1])
Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[4,1])
```

The original IR is
```python
primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [4, 4], []),
             B: Buffer(B_2: Pointer(float32), float32, [4, 4], []),
             A: Buffer(A_2: Pointer(float32), float32, [4, 4], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (i.outer: int32, 0, 2) {
    for (j.outer: int32, 0, 2) {
      for (i.inner: int32, 0, 2) {
        for (j.inner: int32, 0, 2) {
          C_2[((((i.outer*8) + (i.inner*4)) + (j.outer*2)) + j.inner)] = ((float32*)A_2[((((i.outer*8) + (i.inner*4)) + (j.outer*2)) + j.inner)] + (float32*)B_2[((((i.outer*8) + (i.inner*4)) + (j.outer*2)) + j.inner)])
        }
      }
    }
  }
}
```
As you can see, the coeff of i.inner is 4, i.inner*4 is 1. These two-axis are the Tensorize region. They will be verified in the bind buffer step in StorageFlatten Pass.





---
[Visit Topic](https://discuss.tvm.apache.org/t/te-tensorize-elementwise-sum/9335/2) to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/51acb41e5a2542204e7523308d7d6000bab44447f6d99a85cc4b06a31b6eeaaa).