You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/11/05 00:14:25 UTC

[GitHub] [tvm] nverke opened a new pull request, #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

nverke opened a new pull request, #13301:
URL: https://github.com/apache/tvm/pull/13301

   …odes match_buffer statements when validating writes
   
   Previously this check did not take into account any match_buffer statements and consequently would fail for tensorized schedules. Now it takes these into account when possible. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] junrushao commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
junrushao commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1019702531


##########
tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py:
##########
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):

Review Comment:
   I'm late to the party! Just a quick note: this TVMScript is not valid TIR and I happened to detect it when using the new TVMScript parser which checks more carefully :-)
   
   More specifically, both `T.init()` and `T.axis.reduce` should be placed immediately under a TIR block, while line 611 and line 612 are not :-(
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on PR #13301:
URL: https://github.com/apache/tvm/pull/13301#issuecomment-1306352476

   @tvm-bot rerun 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] junrushao commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
junrushao commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1019707848


##########
tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py:
##########
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):

Review Comment:
   I would love to temporarily exclude this particular TVMScript from testing, but I'm happy to merge it back if you have a follow-up PR to fix it :-)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] tvm-bot commented on pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
tvm-bot commented on PR #13301:
URL: https://github.com/apache/tvm/pull/13301#issuecomment-1304351824

   <!---bot-comment-->
   
   Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment.
   
   <!--bot-comment-ccs-start-->
    * cc @Hzfengsy, @junrushao <sub>See [#10317](https://github.com/apache/tvm/issues/10317) for details</sub><!--bot-comment-ccs-end-->
   
   <sub>Generated by [tvm-bot](https://github.com/apache/tvm/blob/main/ci/README.md#github-actions)</sub>


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on PR #13301:
URL: https://github.com/apache/tvm/pull/13301#issuecomment-1306088235

   > Thanks @nverke for the enhancement. Could you please add a regression test case?
   
   Just added! 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1019893402


##########
tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py:
##########
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):

Review Comment:
   Oh interesting! I see — so the reduction axis is tied to the block despite being within the loop. I can follow up with a commit in a few days, or you can remove the test if it's causing issues. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1019893402


##########
tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py:
##########
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):

Review Comment:
   Oh interesting! I was not aware of this requirement. I can follow up with a commit in a few days, or you can remove the test if it's causing issues. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1021841923


##########
tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py:
##########
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):

Review Comment:
   https://github.com/apache/tvm/pull/13373/files



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] vinx13 commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
vinx13 commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1018312684


##########
src/tir/schedule/analysis/reducer.cc:
##########
@@ -572,9 +572,25 @@ bool ReductionIterNotIndexOutputBuffer(const Block& block) {
     if (!store) {
       return true;
     }
-    ICHECK(buffer_written.count(store->buffer.get()))
-        << "ValueError: The buffer \"" << store->buffer
-        << "\" is written in the block but is not in the block's signature";
+    const auto* body_block = block->body.as<BlockRealizeNode>();

Review Comment:
   checking block->body is not sufficient. It is possible the inner block has outer loops. 
   consider the case:
   ```
   block1
     for …
        block2
           match_buffer
           buffer_store
   ```
   
   In this case, we need to the parent block of the store statement 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] Hzfengsy commented on pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
Hzfengsy commented on PR #13301:
URL: https://github.com/apache/tvm/pull/13301#issuecomment-1304430330

   Thanks @nverke for the enhancement. Could you please add a regression test case?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1018457482


##########
src/tir/schedule/analysis/reducer.cc:
##########
@@ -572,9 +572,25 @@ bool ReductionIterNotIndexOutputBuffer(const Block& block) {
     if (!store) {
       return true;
     }
-    ICHECK(buffer_written.count(store->buffer.get()))
-        << "ValueError: The buffer \"" << store->buffer
-        << "\" is written in the block but is not in the block's signature";
+    const auto* body_block = block->body.as<BlockRealizeNode>();

Review Comment:
   Alright made changes accordingly and added a test to validate this case as well as another test that on the other scenario even though that was never an issue.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] tmoreau89 merged pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
tmoreau89 merged PR #13301:
URL: https://github.com/apache/tvm/pull/13301


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] vinx13 commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
vinx13 commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1018387788


##########
src/tir/schedule/analysis/reducer.cc:
##########
@@ -572,9 +572,25 @@ bool ReductionIterNotIndexOutputBuffer(const Block& block) {
     if (!store) {
       return true;
     }
-    ICHECK(buffer_written.count(store->buffer.get()))
-        << "ValueError: The buffer \"" << store->buffer
-        << "\" is written in the block but is not in the block's signature";
+    const auto* body_block = block->body.as<BlockRealizeNode>();

Review Comment:
   I'm referring to your former example. In this case, `if (body_block)` will always be false, and `match_buffer` will not be checked.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] junrushao commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
junrushao commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1019703907


##########
tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py:
##########
@@ -487,6 +487,149 @@ def lowered_single_reduction_loop_with_block_predicate(
                     )
 
 
+@T.prim_func
+def single_reduction_loop_with_tensorize(
+    input_A: T.Buffer[(1, 64, 7, 7, 32), "uint8"],
+    input_B: T.Buffer[(16, 64, 1, 1, 8, 32, 4), "int8"],
+    output: T.Buffer[(1, 16, 7, 7, 32), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for i1, i2, i3, i4, i5 in T.grid(16, 4, 98, 2, 32):
+        with T.block("compute_o"):
+            n = T.axis.spatial(1, 0)
+            oc_chunk = T.axis.spatial(16, i1)
+            oh = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) // 3584)
+            ow = T.axis.spatial(7, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 3584 // 512)
+            kh = T.axis.reduce(1, 0)
+            kw = T.axis.reduce(1, 0)
+            ic_outer = T.axis.reduce(64, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 512 // 8)
+            ic_f_inner = T.axis.reduce(8, (i2 * 6272 + i3 * 64 + i4 * 32 + i5) % 8)
+            T.reads(
+                input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+            )
+            T.writes(output[n, oc_chunk, oh, ow, 0:32])
+            with T.init():
+                for x in T.serial(32):
+                    with T.block("compute_init"):
+                        oc_block_i_init = T.axis.spatial(32, x)
+                        T.reads()
+                        T.writes(output[n, oc_chunk, oh, ow, oc_block_i_init])
+                        output[n, oc_chunk, oh, ow, oc_block_i_init] = 0
+            with T.block("compute_o"):
+                T.reads(
+                    output[n, oc_chunk, oh, ow, 0:32],
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                )
+                T.writes(output[n, oc_chunk, oh, ow, 0:32])
+                A = T.match_buffer(
+                    input_A[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4],
+                    [4],
+                    dtype="uint8",
+                    offset_factor=1,
+                )
+                B = T.match_buffer(
+                    input_B[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0:32, 0:4],
+                    [32, 4],
+                    dtype="int8",
+                    offset_factor=1,
+                )
+                C = T.match_buffer(
+                    output[n, oc_chunk, oh, ow, 0:32], [32], dtype="int32", offset_factor=1
+                )
+                A_u8x4: T.uint8x4 = A[0:4]
+                A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32")
+                B_i8x128 = B[0, 0:128]
+                B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+                C[0:32] = T.call_llvm_pure_intrin(
+                    4217, T.uint32(3), C[0:32], T.broadcast(A_i32, 32), B_i32x32, dtype="int32x32"
+                )
+
+
+@T.prim_func
+def nested_reduction_loop_with_inner_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):
+                    T.reads(
+                        out[yi, xr],
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4],
+                    )
+                    T.writes(out[yi, xr])
+                    A = T.match_buffer(
+                        in0[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    B = T.match_buffer(
+                        in1[yi, yi * 4 + xr : yi * 4 + xr + 4], [4], dtype="int8", offset_factor=1
+                    )
+                    C = T.match_buffer(out[yi, xr], [1], dtype="int32", offset_factor=1)
+                    A_i8x4: T.int8x4 = A[0:4]
+                    A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
+                    B_i8x4: T.int8x4 = B[0:4]
+                    B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
+                    C[0] = A_i32 + B_i32 + C[0]
+
+
+@T.prim_func
+def nested_reduction_loop_with_outer_match_buffers(
+    in0: T.Buffer[(4, 16), "int8"],
+    in1: T.Buffer[(4, 16), "int8"],
+    out: T.Buffer[(4, 4), "int32"],
+) -> None:
+    # body
+    # with T.block("root")
+    for y in T.serial(4):
+        with T.block("C"):
+            yi = T.axis.spatial(4, y)
+            T.reads(in0[yi, 0:16], in1[yi, 0:16])
+            T.writes(out[yi, 0:4])
+            A = T.match_buffer(in0[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            B = T.match_buffer(in1[yi, 0:16], [16], dtype="int8", offset_factor=1)
+            C = T.match_buffer(out[yi, 0:4], [4], dtype="int32", offset_factor=1)
+            for x in T.serial(4):
+                xr = T.axis.reduce(4, x)
+                with T.init():
+                    for i in T.serial(4):
+                        with T.block("C_init"):
+                            ii = T.axis.spatial(4, i)
+                            T.reads()
+                            T.writes(out[yi, ii])
+                            out[yi, ii] = 0
+                with T.block("C"):

Review Comment:
   If we print out the testcase using `nested_reduction_loop_with_outer_match_buffers.show()`, then the TIR looks like:
   
   ```python
   # from tvm.script import tir as T
   @T.prim_func
   def func(in0: T.Buffer[(4, 16), "int8"], in1: T.Buffer[(4, 16), "int8"], out: T.Buffer[(4, 4), "int32"]):
       # body
       # with T.block("root")
       for y in T.serial(4):
           with T.block("C"):
               yi = T.axis.spatial(4, y)
               xr = T.axis.reduce(4, x)
               T.reads(in0[yi, 0 : 16], in1[yi, 0 : 16])
               T.writes(out[yi, 0 : 4])
               A = T.match_buffer(in0[yi, 0 : 16], [16], dtype="int8", offset_factor=1)
               B = T.match_buffer(in1[yi, 0 : 16], [16], dtype="int8", offset_factor=1)
               C = T.match_buffer(out[yi, 0 : 4], [4], dtype="int32", offset_factor=1)
               with T.init():
                   for i in T.serial(4):
                       with T.block("C_init"):
                           ii = T.axis.spatial(4, i)
                           T.reads()
                           T.writes(out[yi, ii])
                           out[yi, ii] = 0
               for x in T.serial(4):
                   with T.block("C"):
                       T.reads(out[yi, xr], in0[yi, yi * 4 + xr : yi * 4 + xr + 4], in1[yi, yi * 4 + xr : yi * 4 + xr + 4])
                       T.writes(out[yi, xr])
                       A_i8x4: T.int8x4 = A[yi * 4 + xr:yi * 4 + xr + 4]
                       A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
                       B_i8x4: T.int8x4 = B[yi * 4 + xr:yi * 4 + xr + 4]
                       B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
                       C[xr] = A_i32 + B_i32 + C[xr]
   ```
   
   where we may see some use-before-def issues



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] nverke commented on a diff in pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13301:
URL: https://github.com/apache/tvm/pull/13301#discussion_r1018349185


##########
src/tir/schedule/analysis/reducer.cc:
##########
@@ -572,9 +572,25 @@ bool ReductionIterNotIndexOutputBuffer(const Block& block) {
     if (!store) {
       return true;
     }
-    ICHECK(buffer_written.count(store->buffer.get()))
-        << "ValueError: The buffer \"" << store->buffer
-        << "\" is written in the block but is not in the block's signature";
+    const auto* body_block = block->body.as<BlockRealizeNode>();

Review Comment:
   Hmm I am not sure if I understand. Are you talking about a situation like this? 
   ```
   @T.prim_func
   def nested_reduction_loop_with_match_buffers(
       in0: T.Buffer[(4, 4, 4), "int8"],
       in1: T.Buffer[(4, 4, 4), "int8"],
       out: T.Buffer[(4, 4, 4), "int8"],
   ) -> None:
       # body
       # with T.block("root")
       for y in T.serial(4):
           with T.block("C"):
               T.reads(in0[y, 0:4, 0:4], in1[y, 0:4, 0:4])
               T.writes(out[y, 0:4, 0:4])
               for x in T.serial(4):
                   with T.block("C"):
                       T.reads(in0[y, x, 0:4], in1[y, x, 0:4])
                       T.writes(out[y, x, 0:4])
                       A = T.match_buffer(in0[y, x, 0:4], [4], dtype="int8", offset_factor=1)
                       B = T.match_buffer(in1[y, x, 0:4], [4], dtype="int8", offset_factor=1)
                       C = T.match_buffer(out[y, x, 0:4], [4], dtype="int8", offset_factor=1)
                       A_i8x4: T.int8x4 = A[0:4]
                       A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
                       B_i8x4: T.int8x4 = B[0:4]
                       B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
                       C[0:4] = T.reinterpret(A_i32 + B_i32, dtype="int8x4")
   ```
   My understanding is that this check is on the level of the store statement and already has collected all of the write regions for the parent loops so just adding the regions that the match buffers check should be enough. 
   
   Alternatively are you referring to something like this? 
   ```
   @T.prim_func
   def nested_reduction_loop_with_match_buffers(
       in0: T.Buffer[(4, 4, 4), "int8"],
       in1: T.Buffer[(4, 4, 4), "int8"],
       out: T.Buffer[(4, 4, 4), "int8"],
   ) -> None:
       # body
       # with T.block("root")
       for y in T.serial(4):
           with T.block("C"):
               T.reads(in0[y, 0:4, 0:4], in1[y, 0:4, 0:4])
               T.writes(out[y, 0:4, 0:4])
               A = T.match_buffer(in0[y, 0:4, 0:4], [4, 4], dtype="int8", offset_factor=1)
               B = T.match_buffer(in1[y, 0:4, 0:4], [4, 4], dtype="int8", offset_factor=1)
               C = T.match_buffer(out[y, 0:4, 0:4], [4, 4], dtype="int8", offset_factor=1)
               for x in T.serial(4):
                   with T.block("C"):
                       T.reads(in0[y, x, 0:4], in1[y, x, 0:4])
                       T.writes(out[y, x, 0:4])
                       A_i8x4: T.int8x4 = A[x, 0:4]
                       A_i32: T.int32 = T.reinterpret(A_i8x4, dtype="int32")
                       B_i8x4: T.int8x4 = B[x, 0:4]
                       B_i32: T.int32 = T.reinterpret(B_i8x4, dtype="int32")
                       C[x, 0:4] = T.reinterpret(A_i32 + B_i32, dtype="int8x4")
   ```
   Here I believe we are still able to pickup the match buffers from the body block. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] tmoreau89 commented on pull request #13301: [TIR] Update ReductionIterNotIndexOutputBuffer to check BlockRealizeN…

Posted by GitBox <gi...@apache.org>.
tmoreau89 commented on PR #13301:
URL: https://github.com/apache/tvm/pull/13301#issuecomment-1310613761

   Thank you @nverke @vinx13 - PR has been merged!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org