Posted to discuss-archive@tvm.apache.org by chenugray via Apache TVM Discuss <no...@discuss.tvm.ai> on 2022/03/16 03:07:39 UTC

[Apache TVM Discuss] [Questions] [BUG Report] auto dense large gpu schedule


Take this code for example:

    import tvm
    from tvm import relay, autotvm
    from tvm.autotvm.tuner import XGBTuner


    def test_dense_autotvm():
        target = tvm.target.cuda()

        # fp16 dense with fp32 accumulation: (16384, 768) x (768, 768)^T
        batch, in_dim, out_dim = 16384, 768, 768
        data_shape = (batch, in_dim)
        weight_shape = (out_dim, in_dim)
        data = relay.var("data", shape=data_shape, dtype="float16")
        weight = relay.var("weight", shape=weight_shape, dtype="float16")
        dense_val = relay.nn.dense(data, weight, out_dtype="float32")
        func = relay.Function(relay.analysis.free_vars(dense_val), dense_val)
        mod = tvm.IRModule()
        mod["main"] = func

        log_filename = "dense_autotvm.log"
        tmp_logfile = log_filename + ".tmp"
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(timeout=10, n_parallel=1),
            runner=autotvm.LocalRunner(
                number=1, repeat=2, timeout=10, min_repeat_ms=100),
        )

        tasks = autotvm.task.extract_from_program(
            func, target=target, params=None, ops=None)
        # tasks[2] is the dense_large_batch.cuda template in this setup
        tsk = tasks[2]
        tuner_obj = XGBTuner(tsk, loss_type="rank")
        tuner_obj.tune(
            n_trial=10,
            early_stopping=0,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(10),
                autotvm.callback.log_to_file(tmp_logfile),
            ],
        )

When running this program with `pytest -s test_my_dense.py`, errors like the following appear:

    [17:52:54] .../verify_gpu_code.cc:298: VerifyGPUCode err: Extent of threadIdx.y (1) does not match the bound 16
    [17:52:54] .../verify_gpu_code.cc:298: VerifyGPUCode err: Extent of threadIdx.x (16) does not match the bound 1
    [17:52:54] .../verify_gpu_code.cc:298: VerifyGPUCode err: Used shared memory per block (2146304) is greater than the allowed maximum (49152)

The test device is an NVIDIA T4.
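
For reference, the hardware limits that the verifier checks against can be read back from the device itself, a minimal sketch using `tvm.runtime.Device` attributes:

    import tvm

    dev = tvm.cuda(0)
    # On a T4 this reports 49152 bytes of shared memory per block and 1024
    # threads per block, matching the "allowed maximum (49152)" in the error.
    print("max_shared_memory_per_block:", dev.max_shared_memory_per_block)
    print("max_threads_per_block:", dev.max_threads_per_block)
    print("warp_size:", dev.warp_size)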

Printing the lowered TIR gives the dump below. To keep the IR concise, I commented out the unroll and double-buffer steps.

    [17:52:54] /home/qqqqq/source_code/tvm/src/tir/analysis/verify_gpu_code.cc:298: VerifyGPUCode err: Used shared memory per block (1609728) is greater than the allowed maximum (49152)
     Current/Best:    0.00/   0.00 GFLOPS | Progress: (9/10) | 2.76 s
@main = primfn(placeholder_2: handle, placeholder_3: handle, T_matmul_NT_1: handle) -> ()
      attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
      buffers = {T_matmul_NT: Buffer(T_matmul_NT_2: Pointer(float32), float32, [12582912], []),
                 placeholder_1: Buffer(placeholder_4: Pointer(float16), float16, [589824], []),
                 placeholder: Buffer(placeholder_5: Pointer(float16), float16, [12582912], [])}
      buffer_map = {placeholder_2: placeholder, placeholder_3: placeholder_1, T_matmul_NT_1: T_matmul_NT} {
      attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 4;
      allocate(T_matmul_NT.local: Pointer(local float32), float32, [98304]), storage_scope = local;
      allocate(placeholder.shared: Pointer(shared float16), float16, [1048576]), storage_scope = shared;
      allocate(placeholder.d.shared: Pointer(shared float16), float16, [24576]), storage_scope = shared;
      allocate(placeholder.shared.local: Pointer(local float16), float16, [131072]), storage_scope = local;
      allocate(placeholder.d.shared.local: Pointer(local float16), float16, [192]), storage_scope = local;
      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 2;
      attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 16;
      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 {
        for (i.c.init: int32, 0, 8) {
          for (j.c.init: int32, 0, 3) {
            for (vthread.s: int32, 0, 1024) {
              let cse_var_1: int32 = (((vthread.s*24) + (i.c.init*3)) + j.c.init)
               {
                T_matmul_NT.local_1: Buffer(T_matmul_NT.local, float32, [14155776], [], scope="local", align=64)[cse_var_1] = 0f32
                T_matmul_NT.local_1[(cse_var_1 + 24576)] = 0f32
                T_matmul_NT.local_1[(cse_var_1 + 49152)] = 0f32
                T_matmul_NT.local_1[(cse_var_1 + 73728)] = 0f32
              }
            }
          }
        }
        for (k.outer: int32, 0, 6) {
          attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
          for (ax0.inner: int32, 0, 8192) {
            for (ax1.outer: int32, 0, 32) {
              attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
              for (ax1.inner.inner: int32, 0, 4) {
                let cse_var_2: int32 = (ax1.outer*4)
                placeholder.shared_1: Buffer(placeholder.shared, float16, [1048576], [], scope="shared")[(((ax0.inner*128) + cse_var_2) + ax1.inner.inner)] = placeholder[(((((blockIdx.x*6291456) + (ax0.inner*768)) + (k.outer*128)) + cse_var_2) + ax1.inner.inner)]
              }
            }
          }
          attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 16;
          for (ax0.inner_1: int32, 0, 12) {
            for (ax1.outer_1: int32, 0, 2) {
              attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
              for (ax1.inner.inner_1: int32, 0, 4) {
                let cse_var_3: int32 = (ax1.outer_1*64)
                placeholder.d.shared_1: Buffer(placeholder.d.shared, float16, [24576], [], scope="shared")[(((((threadIdx.y_2*1536) + (ax0.inner_1*128)) + cse_var_3) + (threadIdx.x_2*4)) + ax1.inner.inner_1)] = placeholder_1[(((((((blockIdx.y*147456) + (threadIdx.y_2*9216)) + (ax0.inner_1*768)) + (k.outer*128)) + cse_var_3) + (threadIdx.x_2*4)) + ax1.inner.inner_1)]
              }
            }
          }
          for (k.inner.outer: int32, 0, 8) {
            for (ax0: int32, 0, 8) {
              for (ax1: int32, 0, 16) {
                for (vthread.s_1: int32, 0, 1024) {
                  placeholder.shared.local_1: Buffer(placeholder.shared.local, float16, [16384], [], scope="local")[(((vthread.s_1*128) + (ax0*16)) + ax1)] = placeholder.shared_1[((((vthread.s_1*1024) + (ax0*128)) + (k.inner.outer*16)) + ax1)]
                }
              }
            }
            for (ax0_1: int32, 0, 3) {
              for (ax1_1: int32, 0, 16) {
                let cse_var_4: int32 = ((ax0_1*16) + ax1_1)
                 {
                  placeholder.d.shared.local_1: Buffer(placeholder.d.shared.local, float16, [2304], [], scope="local", align=64)[cse_var_4] = placeholder.d.shared_1[((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1)]
                  placeholder.d.shared.local_1[(cse_var_4 + 48)] = placeholder.d.shared_1[(((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1) + 6144)]
                  placeholder.d.shared.local_1[(cse_var_4 + 96)] = placeholder.d.shared_1[(((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1) + 12288)]
                  placeholder.d.shared.local_1[(cse_var_4 + 144)] = placeholder.d.shared_1[(((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1) + 18432)]
                }
              }
            }
            for (k.inner.inner: int32, 0, 16) {
              for (i.c: int32, 0, 8) {
                for (j.c: int32, 0, 3) {
                  for (vthread.s_2: int32, 0, 1024) {
                    let cse_var_10: int32 = ((j.c*16) + k.inner.inner)
                    let cse_var_9: int32 = (((vthread.s_2*24) + (i.c*3)) + j.c)
                    let cse_var_8: int32 = (((vthread.s_2*128) + (i.c*16)) + k.inner.inner)
                    let cse_var_7: int32 = (cse_var_9 + 24576)
                    let cse_var_6: int32 = (cse_var_9 + 49152)
                    let cse_var_5: int32 = (cse_var_9 + 73728)
                     {
                      T_matmul_NT.local_1[cse_var_9] = (T_matmul_NT.local_1[cse_var_9] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[cse_var_10])))
                      T_matmul_NT.local_1[cse_var_7] = (T_matmul_NT.local_1[cse_var_7] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[(cse_var_10 + 48)])))
                      T_matmul_NT.local_1[cse_var_6] = (T_matmul_NT.local_1[cse_var_6] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[(cse_var_10 + 96)])))
                      T_matmul_NT.local_1[cse_var_5] = (T_matmul_NT.local_1[cse_var_5] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[(cse_var_10 + 144)])))
                    }
                  }
                }
              }
            }
          }
        }
        for (j.inner.inner.inner: int32, 0, 3) {
          for (i.inner.inner.inner: int32, 0, 8) {
            for (vthread.s_3: int32, 0, 1024) {
              let cse_var_11: int32 = (((vthread.s_3*24) + (i.inner.inner.inner*3)) + j.inner.inner.inner)
               {
                T_matmul_NT[((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner)] = T_matmul_NT.local_1[cse_var_11]
                T_matmul_NT[(((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner) + 48)] = T_matmul_NT.local_1[(cse_var_11 + 24576)]
                T_matmul_NT[(((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner) + 96)] = T_matmul_NT.local_1[(cse_var_11 + 49152)]
                T_matmul_NT[(((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner) + 144)] = T_matmul_NT.local_1[(cse_var_11 + 73728)]
              }
            }
          }
        }
      }
    }
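
For reference, the dump above can be reproduced for a single candidate config without running the tuner, roughly like this (a sketch; `tsk` is the task object from the test above, and the config index 0 is an arbitrary example):

    # Instantiate one candidate config from the tuning space and print its TIR.
    # Any index into tsk.config_space can be inspected this way.
    cfg = tsk.config_space.get(0)
    with tsk.target:
        sched, arg_bufs = tsk.instantiate(cfg)
        print(tvm.lower(sched, arg_bufs, simple_mode=True))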


So in the stages that move `data` and `weight` from global memory to shared memory, the thread bindings look strange: the data copy uses (threadIdx.x, threadIdx.y) = (1, 1) and the weight copy uses (16, 16), neither of which matches the kernel-level binding of (1, 16), which is exactly what the verifier complains about. The two shared buffers also add up to 2146304 bytes, far beyond the 49152-byte limit. This looks like a bug in the dense large-batch GPU schedule.
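
For completeness, the same limits can be checked by hand on the lowered PrimFunc with `tvm.tir.analysis.verify_gpu_code` (a sketch, reusing `sched` and `arg_bufs` from the snippet above; constraint values are the T4 limits quoted in the error):

    from tvm.tir.analysis import verify_gpu_code

    # T4 limits, copied from the verifier error above.
    constraints = {
        "max_shared_memory_per_block": 49152,
        "max_threads_per_block": 1024,
    }
    lowered = tvm.lower(sched, arg_bufs)
    # Should flag the offending config (logs the same errors and reports the
    # check as failed), assuming this TVM version's analysis API.
    print(verify_gpu_code(lowered["main"], constraints))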

@masahi @tqchen




