You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2021/03/10 18:53:25 UTC
[tvm] branch main updated: [CUDA][TOPI] Fix
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (#7623)
This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 829f44c [CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (#7623)
829f44c is described below
commit 829f44c9c838c7ad5c6344754e45bccefc545b2c
Author: Trevor Morris <tr...@amazon.com>
AuthorDate: Wed Mar 10 10:53:07 2021 -0800
[CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (#7623)
* Use less threads for certain GPUs to avoid register limit
* Move util function to nvcc.py
* Fix lint
---
python/tvm/contrib/nvcc.py | 41 +++++++++++++++++++++++++++++++++++++++++
python/tvm/topi/cuda/nms.py | 9 +++++++++
2 files changed, 50 insertions(+)
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index f33603b..7e49f55 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -216,6 +216,47 @@ def callback_libdevice_path(arch):
return ""
def get_target_compute_version(target=None):
    """Utility function to get compute capability of compilation target.

    Looks for the arch in three different places, first in the target attributes, then the global
    scope, and finally the GPU device (if it exists).

    Parameters
    ----------
    target : tvm.target.Target, optional
        The compilation target

    Returns
    -------
    compute_version : str or None
        compute capability of a GPU (e.g. "8.0"), or None if it could not be
        determined from any of the three sources.
    """
    # 1. Target attributes, e.g. Target("cuda -arch=sm_80").
    if target and "arch" in target.attrs:
        # arch has the form "sm_<major><minor>". The minor version is always a
        # single digit, so slice off the last character rather than unpacking
        # individual characters — that also handles multi-digit major versions
        # (e.g. "sm_100" -> "10.0"), where character unpacking would raise.
        arch = str(target.attrs["arch"]).split("_")[1]
        return arch[:-1] + "." + arch[-1]

    # 2. Global scope set through autotvm.
    from tvm.autotvm.env import AutotvmGlobalScope  # pylint: disable=import-outside-toplevel

    if AutotvmGlobalScope.current.cuda_target_arch:
        arch = AutotvmGlobalScope.current.cuda_target_arch.split("_")[1]
        return arch[:-1] + "." + arch[-1]

    # 3. A physically present GPU.
    if tvm.gpu(0).exist:
        return tvm.gpu(0).compute_version

    # Trailing space before the second literal: adjacent string literals are
    # concatenated with no separator, so without it the message would read
    # "...GPU detected.Try specifying...".
    warnings.warn(
        "No CUDA architecture was specified or GPU detected. "
        "Try specifying it by adding '-arch=sm_xx' to your target."
    )
    return None
+
+
def parse_compute_version(compute_version):
"""Parse compute capability string to divide major and minor version
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 83b5385..ccc2ec9 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -19,6 +19,7 @@
"""Non-maximum suppression operator"""
import tvm
from tvm import te
+from tvm.contrib import nvcc
from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
from tvm.tir import if_then_else
from .sort import argsort, argsort_thrust
@@ -493,6 +494,14 @@ def nms_ir(
nthread_by = batch_size
nthread_tx = max_threads
+ # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
+ # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
+ # be exceeded with 1024 threads.
+ target = tvm.target.Target.current(allow_none=False)
+ if target.kind.name == "cuda":
+ if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
+ nthread_tx = 512
+
by = te.thread_axis("blockIdx.y")
tx = te.thread_axis("threadIdx.x")
ib.scope_attr(by, "thread_extent", nthread_by)