Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/06/04 10:07:36 UTC

[GitHub] [tvm] kirliavc opened a new issue, #11572: [Bug] [VTA] Cannot build GEMM Tutorial code after rescheduling

URL: https://github.com/apache/tvm/issues/11572

   I'm running the VTA GEMM tutorial code from https://github.com/apache/tvm/blob/main/vta/tutorials/matrix_multiply.py. After changing the schedule and the tiling parameters of the original code, the test fails.
   
   The original code in matrix_multiply.py uses ```m=16, n=16, o=1```; I changed these to ```m=4, n=4, o=4```. I also changed
   ```
   s[A_buf].compute_at(s[C_buf], ko)
   s[B_buf].compute_at(s[C_buf], ko)
   ```
   to
   ```
   s[A_buf].compute_at(s[C_buf], s[C_buf].op.axis[1])
   s[B_buf].compute_at(s[C_buf], s[C_buf].op.axis[1])
   ```
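   
   For reference, the placement I'm aiming for looks roughly like this (a hand-written sketch, not compiler output; co is s[C_buf].op.axis[1], and ko ends up outermost because of the reorder in the full script below):
   ```
   # for ko:                       # reduction axis, outermost after reorder
   #     for bo:
   #         for co:               # the new compute_at anchor
   #             load A_buf tile   # DMA loads now sit inside the co loop
   #             load B_buf tile
   #             gemm over (bi, ci, ki)
   ```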
   With this placement, the loads into the A and B buffers should move into the lower-level loop nest, as sketched above. When I attempt to build, the lowering step finishes successfully and produces the following IR:
   ```
   @main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
     attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
     buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
                B: Buffer(B_2: Pointer(int8), int8, [4096], []),
                C: Buffer(C_2: Pointer(int8), int8, [256], [])}
     buffer_map = {A_1: A, B_1: B, C_1: C} {
     @tir.call_extern("VTASetDebugMode", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 1, dtype=int32)
     attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
       attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp" {
         @tir.call_extern("VTAUopLoopBegin", 4, 4, 0, 0, dtype=int32)
         @tir.call_extern("VTAUopLoopBegin", 4, 1, 0, 0, dtype=int32)
         @tir.vta.uop_push(0, 1, 0, 0, 0, 0, 0, 0, dtype=int32)
         @tir.call_extern("VTAUopLoopEnd", dtype=int32)
         @tir.call_extern("VTAUopLoopEnd", dtype=int32)
       }
       @tir.vta.coproc_dep_push(2, 1, dtype=int32)
     }
     for (ko: int32, 0, 4) {
       for (bo: int32, 0, 4) {
         for (co: int32, 0, 4) {
           let cse_var_1: int32 = (bo*4)
            {
             attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 1 {
               @tir.vta.coproc_dep_pop(2, 1, dtype=int32)
               @tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), A_2, (cse_var_1 + ko), 1, 1, 1, 0, 0, 0, 0, 0, 2, dtype=int32)
               @tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), B_2, ((co*4) + ko), 1, 1, 1, 0, 0, 0, 0, 0, 1, dtype=int32)
               @tir.vta.coproc_dep_push(1, 2, dtype=int32)
             }
             attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
               @tir.vta.coproc_dep_pop(1, 2, dtype=int32)
               attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp";
               @tir.vta.uop_push(0, 0, (cse_var_1 + co), 0, 0, 0, 0, 0, dtype=int32)
               @tir.vta.coproc_dep_push(2, 1, dtype=int32)
             }
           }
         }
       }
     }
     @tir.vta.coproc_dep_push(2, 3, dtype=int32)
     @tir.vta.coproc_dep_pop(2, 1, dtype=int32)
     attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 3 {
       @tir.vta.coproc_dep_pop(2, 3, dtype=int32)
       @tir.call_extern("VTAStoreBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 0, 4, C_2, 0, 16, 1, 16, dtype=int32)
     }
     @tir.vta.coproc_sync(, dtype=int32)
   }
   ```
   ### Expected behavior
   
   It should build successfully and run correctly under the Chisel simulator.
   
   ### Actual behavior
   
   The module builds, but running it fails with the following error message:
   ```
   Traceback (most recent call last):
     File "/home/GROUPS/jlc/tvm/vta/tutorials/matmul_v2.py", line 459, in <module>
       f(A_nd, B_nd, C_nd)
     File "/home/GROUPS/jlc/tvm/python/tvm/runtime/module.py", line 178, in __call__
       return self.entry_func(*args)
     File "/home/GROUPS/jlc/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
       raise get_last_ffi_error()
   tvm._ffi.base.TVMError: Traceback (most recent call last):
     8: TVMFuncCall
     7: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
     6: tvm::runtime::LocalSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)
     5: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::WrapPackedFunc(int (*)(TVMValue*, int*, int, TVMValue*, int*, void*), tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
     4: my_gemm
     3: my_gemm_compute_
     2: VTAPushGEMMOp
     1: vta::CommandQueue::PushGEMMOp(void**, int (*)(void*), void*, int)
     0: vta::UopKernelMap::Get(void*, int) [clone .part.0]
     File "/home/GROUPS/jlc/tvm/vta/runtime/runtime.cc", line 565
   TVMError: Check failed: (nbytes == 0 || nbytes == sizeof(int)) is false:
   ```
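   
   The check that fires is the one quoted at the bottom of the traceback, in vta::UopKernelMap::Get (vta/runtime/runtime.cc, line 565): the micro-op kernel signature handed to VTAPushGEMMOp must be either empty or exactly sizeof(int) bytes. My guess at the cause (unconfirmed): in the IR above, the uop_push inside the loop nest indexes the accumulator with ```(cse_var_1 + co)```, so the generated uop kernel captures two loop variables and its packed signature is wider than a single int. A minimal Python rendering of just that guard, purely for illustration (```signature``` is a hypothetical stand-in for the bytes the runtime packs):
   ```
   import ctypes
   
   def uop_kernel_map_get(signature: bytes):
       # Illustrative mirror of the C++ guard in vta::UopKernelMap::Get:
       #   CHECK(nbytes == 0 || nbytes == sizeof(int))
       nbytes = len(signature)
       if not (nbytes == 0 or nbytes == ctypes.sizeof(ctypes.c_int)):
           raise ValueError("Check failed: (nbytes == 0 || nbytes == sizeof(int)) is false")
   
   uop_kernel_map_get(b"")          # ok: constant uop kernel, nothing captured
   uop_kernel_map_get(b"\x00" * 4)  # ok: one captured int (e.g. a single loop variable)
   uop_kernel_map_get(b"\x00" * 8)  # raises: two captured variables, as after rescheduling
   ```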
   
   
   ### Steps to reproduce
   Run the following Python script to reproduce the failure:
   ```
   from __future__ import absolute_import, print_function
   import os
   import tvm
   from tvm import te
   import vta
   import numpy as np
   from tvm import rpc
   from tvm.contrib import utils
   from vta.testing import simulator
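   # Read the VTA hardware configuration (BATCH, BLOCK_IN, BLOCK_OUT, dtypes, memory scopes).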
   env = vta.get_env()
   host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
   port = int(os.environ.get("VTA_RPC_PORT", "9091"))
   if env.TARGET == "pynq" or env.TARGET == "de10nano":
       assert tvm.runtime.enabled("rpc")
       remote = rpc.connect(host, port)
       vta.reconfig_runtime(remote)
       vta.program_fpga(remote, bitstream=None)
   elif env.TARGET in ["sim", "tsim"]:
       remote = rpc.LocalSession()
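   # Tile counts; the original tutorial uses m=16, n=16, o=1.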
   m = 4
   n = 4
   o = 4
   A = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype)
   print(A)
   B = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype)
   print(B)
   A_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf")
   B_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf")
   ko = te.reduce_axis((0, n), name="ko")
   ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
   C_buf = te.compute(
       (o, m, env.BATCH, env.BLOCK_OUT),
       lambda bo, co, bi, ci: te.sum(
           A_buf[bo, ko, bi, ki].astype(env.acc_dtype) * B_buf[co, ko, ci, ki].astype(env.acc_dtype),
           axis=[ko, ki],
       ),
       name="C_buf",
   )
   C = te.compute(
       (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C"
   )
   s = te.create_schedule(C.op)
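   # Place the staging buffers in VTA's on-chip scratchpads.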
   s[A_buf].set_scope(env.inp_scope)
   s[B_buf].set_scope(env.wgt_scope)
   s[C_buf].set_scope(env.acc_scope)
   bo, co, bi, ci = s[C_buf].op.axis
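   # The tutorial anchors these copies at the reduction axis ko;
   # anchoring at co instead moves the A/B loads into the inner loop nest.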
   s[A_buf].compute_at(s[C_buf], co)
   s[B_buf].compute_at(s[C_buf], co)
   s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)
   s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)
   s[C].pragma(s[C].op.axis[0], env.dma_copy)
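   # Make the reduction axis ko the outermost loop.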
   s[C_buf].reorder(
       ko, bo, co, bi, ci, ki
   )
   print(s[C_buf].op.axis)
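   # Map the inner (bi, ci, ki) computation onto VTA's GEMM intrinsic.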
   s[C_buf].tensorize(bi, env.gemm)
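   # Print the lowered TIR with a VTA runtime debug flag set, then build for the VTA target.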
   with vta.build_config(debug_flag = (1<<1)):
       print(vta.lower(s, [A, B, C], simple_mode=True))
   my_gemm = vta.build(
       s, [A, B, C], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm"
   )
   temp = utils.tempdir()
   my_gemm.save(temp.relpath("gemm.o"))
   remote.upload(temp.relpath("gemm.o"))
   f = remote.load_module("gemm.o")
   ctx = remote.ext_dev(0)
   A_orig = np.random.randint(-128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)
   B_orig = np.random.randint(-128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)
   A_packed = A_orig.reshape(o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
   B_packed = B_orig.reshape(m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
   A_nd = tvm.nd.array(A_packed, ctx)
   B_nd = tvm.nd.array(B_packed, ctx)
   C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx)
   if env.TARGET in ["sim", "tsim"]:
       simulator.clear_stats()
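   # Running the module on the device/simulator; this is where the TVMError above is raised.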
   f(A_nd, B_nd, C_nd)
   C_ref = np.dot(A_orig.astype(env.acc_dtype), B_orig.T.astype(env.acc_dtype)).astype(C.dtype)
   C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
   np.testing.assert_equal(C_ref, C_nd.numpy())
   if env.TARGET in ["sim", "tsim"]:
       sim_stats = simulator.stats()
       print("Execution statistics:")
       for k, v in sim_stats.items():
           print("\t{:<16}: {:>16}".format(k, v))
   print("Successful matrix multiply test!")
   ```
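   
   As a side note, the packing and the numpy reference at the end can be sanity-checked without any VTA hardware. A minimal standalone sketch, assuming the default VTA configuration (BATCH=1, BLOCK_IN=BLOCK_OUT=16, int8 inputs, int32 accumulation):
   ```
   import numpy as np
   
   BATCH, BLOCK_IN, BLOCK_OUT = 1, 16, 16  # assumed defaults from vta_config.json
   m = n = o = 4
   
   A_orig = np.random.randint(-128, 128, size=(o * BATCH, n * BLOCK_IN)).astype("int8")
   B_orig = np.random.randint(-128, 128, size=(m * BLOCK_OUT, n * BLOCK_IN)).astype("int8")
   
   # Pack into VTA's blocked layouts, exactly as in the script above.
   A_packed = A_orig.reshape(o, BATCH, n, BLOCK_IN).transpose((0, 2, 1, 3))
   B_packed = B_orig.reshape(m, BLOCK_OUT, n, BLOCK_IN).transpose((0, 2, 1, 3))
   
   # int32 reference matmul, cast back to int8, then repacked like C_nd.
   C_ref = np.dot(A_orig.astype("int32"), B_orig.T.astype("int32")).astype("int8")
   C_ref = C_ref.reshape(o, BATCH, m, BLOCK_OUT).transpose((0, 2, 1, 3))
   assert C_ref.shape == (o, m, BATCH, BLOCK_OUT)  # (4, 4, 1, 16)
   ```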
   

