You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2021/03/10 18:53:25 UTC

[tvm] branch main updated: [CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (#7623)

This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 829f44c  [CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (#7623)
829f44c is described below

commit 829f44c9c838c7ad5c6344754e45bccefc545b2c
Author: Trevor Morris <tr...@amazon.com>
AuthorDate: Wed Mar 10 10:53:07 2021 -0800

    [CUDA][TOPI] Fix CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES with NMS for certain GPUs (#7623)
    
    * Use less threads for certain GPUs to avoid register limit
    
    * Move util function to nvcc.py
    
    * Fix lint
---
 python/tvm/contrib/nvcc.py  | 41 +++++++++++++++++++++++++++++++++++++++++
 python/tvm/topi/cuda/nms.py |  9 +++++++++
 2 files changed, 50 insertions(+)

diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index f33603b..7e49f55 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -216,6 +216,47 @@ def callback_libdevice_path(arch):
         return ""
 
 
+def get_target_compute_version(target=None):
+    """Utility function to get compute capability of compilation target.
+
+    Looks for the arch in three different places: first in the target attributes, then the
+    global scope, and finally the GPU device (if it exists).
+
+    Parameters
+    ----------
+    target : tvm.target.Target, optional
+        The compilation target
+
+    Returns
+    -------
+    compute_version : str
+        compute capability of a GPU (e.g. "8.0")
+    """
+    # 1. Target
+    if target:
+        if "arch" in target.attrs:
+            compute_version = target.attrs["arch"]
+            major, minor = compute_version.split("_")[1]
+            return major + "." + minor
+
+    # 2. Global scope
+    from tvm.autotvm.env import AutotvmGlobalScope  # pylint: disable=import-outside-toplevel
+
+    if AutotvmGlobalScope.current.cuda_target_arch:
+        major, minor = AutotvmGlobalScope.current.cuda_target_arch.split("_")[1]
+        return major + "." + minor
+
+    # 3. GPU
+    if tvm.gpu(0).exist:
+        return tvm.gpu(0).compute_version
+
+    warnings.warn(
+        "No CUDA architecture was specified or GPU detected. "
+        "Try specifying it by adding '-arch=sm_xx' to your target."
+    )
+    return None
+
+
 def parse_compute_version(compute_version):
     """Parse compute capability string to divide major and minor version
 
diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index 83b5385..ccc2ec9 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -19,6 +19,7 @@
 """Non-maximum suppression operator"""
 import tvm
 from tvm import te
+from tvm.contrib import nvcc
 from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
 from tvm.tir import if_then_else
 from .sort import argsort, argsort_thrust
@@ -493,6 +494,14 @@ def nms_ir(
         nthread_by = batch_size
         nthread_tx = max_threads
 
+        # Some CUDA architectures have a smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
+        # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
+        # be exceeded with 1024 threads.
+        target = tvm.target.Target.current(allow_none=False)
+        if target.kind.name == "cuda":
+            if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]:
+                nthread_tx = 512
+
         by = te.thread_axis("blockIdx.y")
         tx = te.thread_axis("threadIdx.x")
         ib.scope_attr(by, "thread_extent", nthread_by)