You are viewing a plain text version of this content. The canonical link for it is here.
Posted to discuss-archive@tvm.apache.org by chenugray via Apache TVM Discuss <no...@discuss.tvm.ai> on 2022/03/16 03:07:39 UTC
[Apache TVM Discuss] [Questions] [BUG Report] auto dense large gpu schedule
Take this code for example:
import numpy as np
import tvm
from tvm.autotvm.tuner import XGBTuner
from tvm import relay, autotvm
import pytest
def test_dense_autotvm():
target = tvm.target.cuda()
batch, in_dim, out_dim = 16384, 768, 768
data_shape = (batch, in_dim)
weight_shape = (out_dim, in_dim)
data = relay.var("data", shape=data_shape, dtype="float16")
weight = relay.var("weight", shape=weight_shape, dtype="float16")
dense_val = relay.nn.dense(data, weight, out_dtype="float32")
func = relay.Function(relay.analysis.free_vars(dense_val), dense_val)
mod = tvm.IRModule()
mod['main'] = func
log_filename = "dense_autotvm.log"
tmp_logfile = "dense_autotvm.log" + ".tmp"
measure_option = autotvm.measure_option(
builder=autotvm.LocalBuilder(timeout=10, n_parallel=1),
runner=autotvm.LocalRunner(
number=1, repeat=2, timeout=10, min_repeat_ms=100),
)
tasks = autotvm.task.extract_from_program(
func, target=target, params=None, ops=None)
tsk = tasks[2]
tuner_obj = XGBTuner(tsk, loss_type="rank")
tuner_obj.tune(n_trial=10, early_stopping=0, measure_option=measure_option,
callbacks=[
autotvm.callback.progress_bar(10, ),
autotvm.callback.log_to_file(tmp_logfile),
])
when run this program, `pytest -s test_my_dense.py`, the erorr may be seen like:
[17:52:54] .../verify_gpu_code.cc:298: VerifyGPUCode err: Extent of threadIdx.y (1) does not match the bound 16
[17:52:54] .../verify_gpu_code.cc:298: VerifyGPUCode err: Extent of threadIdx.x (16) does not match the bound 1
[17:52:54] .../verify_gpu_code.cc:298: VerifyGPUCode err: Used shared memory per block (2146304) is greater than the allowed maximum (49152)
test device should be in T4.
![image|668x365](upload://gxMKzqkayWOSkJUGCS41r9FpYQF.png)
print the llvm ir and you will see the log like below, to make the ir more concise, i comment the unroll and double buffer.
[17:52:54] /home/qqqqq/source_code/tvm/src/tir/analysis/verify_gpu_code.cc:298: VerifyGPUCode err: Used shared memory per block (1609728) is greater than the allowed maximum (49152)
Current/Best: 0.00/ 0.00 GFLOPS | Progress: (9/10) | 2.76 s2 @main = primfn(placeholder_2: handle, placeholder_3: handle, T_matmul_NT_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {T_matmul_NT: Buffer(T_matmul_NT_2: Pointer(float32), float32, [12582912], []),
placeholder_1: Buffer(placeholder_4: Pointer(float16), float16, [589824], []),
placeholder: Buffer(placeholder_5: Pointer(float16), float16, [12582912], [])}
buffer_map = {placeholder_2: placeholder, placeholder_3: placeholder_1, T_matmul_NT_1: T_matmul_NT} {
attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 4;
allocate(T_matmul_NT.local: Pointer(local float32), float32, [98304]), storage_scope = local;
allocate(placeholder.shared: Pointer(shared float16), float16, [1048576]), storage_scope = shared;
allocate(placeholder.d.shared: Pointer(shared float16), float16, [24576]), storage_scope = shared;
allocate(placeholder.shared.local: Pointer(local float16), float16, [131072]), storage_scope = local;
allocate(placeholder.d.shared.local: Pointer(local float16), float16, [192]), storage_scope = local;
attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 2;
attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 16;
attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 {
for (i.c.init: int32, 0, 8) {
for (j.c.init: int32, 0, 3) {
for (vthread.s: int32, 0, 1024) {
let cse_var_1: int32 = (((vthread.s*24) + (i.c.init*3)) + j.c.init)
{
T_matmul_NT.local_1: Buffer(T_matmul_NT.local, float32, [14155776], [], scope="local", align=64)[cse_var_1] = 0f32
T_matmul_NT.local_1[(cse_var_1 + 24576)] = 0f32
T_matmul_NT.local_1[(cse_var_1 + 49152)] = 0f32
T_matmul_NT.local_1[(cse_var_1 + 73728)] = 0f32
}
}
}
}
for (k.outer: int32, 0, 6) {
attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1;
for (ax0.inner: int32, 0, 8192) {
for (ax1.outer: int32, 0, 32) {
attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1;
for (ax1.inner.inner: int32, 0, 4) {
let cse_var_2: int32 = (ax1.outer*4)
placeholder.shared_1: Buffer(placeholder.shared, float16, [1048576], [], scope="shared")[(((ax0.inner*128) + cse_var_2) + ax1.inner.inner)] = placeholder[(((((blockIdx.x*6291456) + (ax0.inner*768)) + (k.outer*128)) + cse_var_2) + ax1.inner.inner)]
}
}
}
attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 16;
for (ax0.inner_1: int32, 0, 12) {
for (ax1.outer_1: int32, 0, 2) {
attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 16;
for (ax1.inner.inner_1: int32, 0, 4) {
let cse_var_3: int32 = (ax1.outer_1*64)
placeholder.d.shared_1: Buffer(placeholder.d.shared, float16, [24576], [], scope="shared")[(((((threadIdx.y_2*1536) + (ax0.inner_1*128)) + cse_var_3) + (threadIdx.x_2*4)) + ax1.inner.inner_1)] = placeholder_1[(((((((blockIdx.y*147456) + (threadIdx.y_2*9216)) + (ax0.inner_1*768)) + (k.outer*128)) + cse_var_3) + (threadIdx.x_2*4)) + ax1.inner.inner_1)]
}
}
}
for (k.inner.outer: int32, 0, 8) {
for (ax0: int32, 0, 8) {
for (ax1: int32, 0, 16) {
for (vthread.s_1: int32, 0, 1024) {
placeholder.shared.local_1: Buffer(placeholder.shared.local, float16, [16384], [], scope="local")[(((vthread.s_1*128) + (ax0*16)) + ax1)] = placeholder.shared_1[((((vthread.s_1*1024) + (ax0*128)) + (k.inner.outer*16)) + ax1)]
}
}
}
for (ax0_1: int32, 0, 3) {
for (ax1_1: int32, 0, 16) {
let cse_var_4: int32 = ((ax0_1*16) + ax1_1)
{
placeholder.d.shared.local_1: Buffer(placeholder.d.shared.local, float16, [2304], [], scope="local", align=64)[cse_var_4] = placeholder.d.shared_1[((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1)]
placeholder.d.shared.local_1[(cse_var_4 + 48)] = placeholder.d.shared_1[(((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1) + 6144)]
placeholder.d.shared.local_1[(cse_var_4 + 96)] = placeholder.d.shared_1[(((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1) + 12288)]
placeholder.d.shared.local_1[(cse_var_4 + 144)] = placeholder.d.shared_1[(((((threadIdx.y*384) + (ax0_1*128)) + (k.inner.outer*16)) + ax1_1) + 18432)]
}
}
}
for (k.inner.inner: int32, 0, 16) {
for (i.c: int32, 0, 8) {
for (j.c: int32, 0, 3) {
for (vthread.s_2: int32, 0, 1024) {
let cse_var_10: int32 = ((j.c*16) + k.inner.inner)
let cse_var_9: int32 = (((vthread.s_2*24) + (i.c*3)) + j.c)
let cse_var_8: int32 = (((vthread.s_2*128) + (i.c*16)) + k.inner.inner)
let cse_var_7: int32 = (cse_var_9 + 24576)
let cse_var_6: int32 = (cse_var_9 + 49152)
let cse_var_5: int32 = (cse_var_9 + 73728)
{
T_matmul_NT.local_1[cse_var_9] = (T_matmul_NT.local_1[cse_var_9] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[cse_var_10])))
T_matmul_NT.local_1[cse_var_7] = (T_matmul_NT.local_1[cse_var_7] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[(cse_var_10 + 48)])))
T_matmul_NT.local_1[cse_var_6] = (T_matmul_NT.local_1[cse_var_6] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[(cse_var_10 + 96)])))
T_matmul_NT.local_1[cse_var_5] = (T_matmul_NT.local_1[cse_var_5] + (cast(float32, placeholder.shared.local_1[cse_var_8])*cast(float32, placeholder.d.shared.local_1[(cse_var_10 + 144)])))
}
}
}
}
}
}
}
for (j.inner.inner.inner: int32, 0, 3) {
for (i.inner.inner.inner: int32, 0, 8) {
for (vthread.s_3: int32, 0, 1024) {
let cse_var_11: int32 = (((vthread.s_3*24) + (i.inner.inner.inner*3)) + j.inner.inner.inner)
{
T_matmul_NT[((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner)] = T_matmul_NT.local_1[cse_var_11]
T_matmul_NT[(((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner) + 48)] = T_matmul_NT.local_1[(cse_var_11 + 24576)]
T_matmul_NT[(((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner) + 96)] = T_matmul_NT.local_1[(cse_var_11 + 49152)]
T_matmul_NT[(((((((blockIdx.x*6291456) + (vthread.s_3*6144)) + (i.inner.inner.inner*768)) + (blockIdx.y*192)) + (threadIdx.y*3)) + j.inner.inner.inner) + 144)] = T_matmul_NT.local_1[(cse_var_11 + 73728)]
}
}
}
}
}
}
so move data and weight from global memory to shared memory, the strange tx, ty (1, 1) and (16, 16)
might be some strange
@masahi @tqchen
---
[Visit Topic](https://discuss.tvm.apache.org/t/bug-report-auto-dense-large-gpu-schedule/12320/1) to respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/d96111978ef2b4c4f1de9ef7c3c87c95d040056f908a37249820de1c67549202).