You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ju...@apache.org on 2023/01/26 23:54:41 UTC
[tvm] branch main updated: [ROCM] Fixes compiling on ROCM 5 and accuracy on dense op (#13847)

This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new ec72ac6690 [ROCM] Fixes compiling on ROCM 5 and accuracy on dense op (#13847)
ec72ac6690 is described below

commit ec72ac66900b0a1a661fcccff1f48ca48b5d5739
Author: masahi <ma...@gmail.com>
AuthorDate: Fri Jan 27 08:54:33 2023 +0900

    [ROCM] Fixes compiling on ROCM 5 and accuracy on dense op (#13847)
    
    Fixes https://github.com/apache/tvm/issues/13666
    
    * Some bitcode files need to be updated
    * There is a strange, device-dependent accuracy issue when using the default topi `dense` op schedule on AMDGPU (see the issue above). I confirmed that other schedule works fine.
---
 python/tvm/contrib/rocm.py           |  4 ++--
 python/tvm/relay/op/strategy/cuda.py | 15 +++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py
index 4f62f1a8da..372281dbab 100644
--- a/python/tvm/contrib/rocm.py
+++ b/python/tvm/contrib/rocm.py
@@ -141,9 +141,7 @@ def callback_rocm_bitcode_path(rocdl_dir=None):
     bitcode_names = [
         "oclc_daz_opt_on",
         "ocml",
-        "hc",
         "irif",  # this does not exist in rocm 3.9, drop eventually
-        "ockl",
         "oclc_correctly_rounded_sqrt_off",
         "oclc_correctly_rounded_sqrt_on",
         "oclc_daz_opt_off",
@@ -152,9 +150,11 @@ def callback_rocm_bitcode_path(rocdl_dir=None):
         "oclc_isa_version_803",  # todo (t-vi): an alternative might be to scan for the
         "oclc_isa_version_900",  #              isa version files (if the linker throws out
         "oclc_isa_version_906",  #              the unneeded ones or we filter for the arch we need)
+        "oclc_isa_version_1030",
         "oclc_unsafe_math_off",
         "oclc_unsafe_math_on",
         "oclc_wavefrontsize64_on",
+        "oclc_abi_version_500",
     ]
 
     bitcode_files = []
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index cc43809266..fa295c93a1 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -918,13 +918,16 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
             name="dense_int8.cuda",
         )
     else:
-        strategy.add_implementation(
-            wrap_compute_dense(topi.gpu.dense_small_batch),
-            wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
-            name="dense_small_batch.gpu",
-        )
+        # Some AMDGPU cards have accuracy issues with this schedule
+        # See https://github.com/apache/tvm/issues/13666
+        if target.kind.name != "rocm":
+            strategy.add_implementation(
+                wrap_compute_dense(topi.gpu.dense_small_batch),
+                wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
+                name="dense_small_batch.gpu",
+            )
 
-        with SpecializedCondition(b >= 32):
+        with SpecializedCondition(target.kind.name == "rocm" or b >= 32):
             strategy.add_implementation(
                 wrap_compute_dense(topi.gpu.dense_large_batch),
                 wrap_topi_schedule(topi.gpu.schedule_dense_large_batch),