Posted to commits@tvm.apache.org by tq...@apache.org on 2020/04/29 17:22:34 UTC
[incubator-tvm] branch master updated: [CODEGEN][CUDA] Fix a bug when vectorized load&store was involved for… (#5428)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-tvm.git
The following commit(s) were added to refs/heads/master by this push:
new f1e87f1 [CODEGEN][CUDA] Fix a bug when vectorized load&store was involved for… (#5428)
f1e87f1 is described below
commit f1e87f1b18be048183bb8d09f417e320d46f8a34
Author: boh_inspur <61...@users.noreply.github.com>
AuthorDate: Wed Apr 29 12:22:25 2020 -0500
[CODEGEN][CUDA] Fix a bug when vectorized load&store was involved for… (#5428)
* [CODEGEN][CUDA] Fix a bug when vectorized load & store was involved for "char2"
* Add unittest for char2
* Support char2 in vector element load & add some unittests for vector element load
* Merge the common logic, support char3, and add unittest for char3
---
src/target/source/codegen_cuda.cc | 37 ++++++++++++++++-------
tests/python/unittest/test_target_codegen_cuda.py | 29 +++++++++++++-----
2 files changed, 48 insertions(+), 18 deletions(-)
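For context: TVM's CUDA backend stores an int8x4 vector as a single 32-bit int and reaches individual lanes by shifting, while 2- and 3-lane int8 vectors lower to CUDA's native char2/char3 struct types, which have no packed 32-bit representation (char3 is a 3-byte struct, so such packing is not even possible). Applying the shift pattern to char2 therefore produced invalid code; the fix below switches lanes 2 and 3 to .x/.y/.z member access. A minimal runnable sketch of the two lane-load styles, assuming that lowering (all identifiers are illustrative, not TVM-generated):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void lane_load_demo(const char2* in2, const int* in4, int* out) {
      // lanes == 2 (or 3): after the fix, lane i is read as a struct
      // member: v.x, v.y (, v.z).
      char2 v2 = in2[0];
      out[0] = v2.x;
      out[1] = v2.y;
      // lanes == 4: the vector is one packed int; lane i is recovered
      // by shifting i * 8 bits, the pattern the old code applied to
      // every int8 width.
      int v4 = in4[0];
      out[2] = (char)(v4 >> 0);
      out[3] = (char)(v4 >> 8);
    }

    int main() {
      char2 h2 = make_char2(1, 2);
      int h4 = 0x04030201;  // four int8 lanes packed little-endian
      char2* d2; int* d4; int* dout; int hout[4];
      cudaMalloc(&d2, sizeof(h2));
      cudaMalloc(&d4, sizeof(h4));
      cudaMalloc(&dout, sizeof(hout));
      cudaMemcpy(d2, &h2, sizeof(h2), cudaMemcpyHostToDevice);
      cudaMemcpy(d4, &h4, sizeof(h4), cudaMemcpyHostToDevice);
      lane_load_demo<<<1, 1>>>(d2, d4, dout);
      cudaMemcpy(hout, dout, sizeof(hout), cudaMemcpyDeviceToHost);
      printf("char2 lanes: %d %d, int8x4 lanes 0-1: %d %d\n",
             hout[0], hout[1], hout[2], hout[3]);
      return 0;
    }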
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 56d7162..a911e6b 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -272,9 +272,17 @@ void CodeGenCUDA::PrintVecElemLoad(
static const char access[] = {'x', 'y', 'z', 'w'};
CHECK(i >= 0 && i < (t.is_float16() ? 8 : 4));
if ((t.is_int()) && t.bits() == 8) {
- os << "((char)(" << vec << " >> " << i * 8 << "))";
+ if (t.lanes() == 2 || t.lanes() == 3) {
+ os << vec << "." << access[i % t.lanes()];
+ } else {
+ os << "((char)(" << vec << " >> " << i * 8 << "))";
+ }
} else if ((t.is_uint()) && t.bits() == 8) {
- os << "((unsigned char)(" << vec << " >> " << i * 8 << "))";
+ if (t.lanes() == 2 || t.lanes() == 3) {
+ os << vec << "." << access[i % t.lanes()];
+ } else {
+ os << "((unsigned char)(" << vec << " >> " << i * 8 << "))";
+ }
} else if (t.is_float16()) {
os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->"
<< access[i % 2];
@@ -289,12 +297,17 @@ void CodeGenCUDA::PrintVecElemStore(
static const char access[] = {'x', 'y', 'z', 'w'};
CHECK(i >= 0 && i < (t.is_float16() ? 8 : 4));
if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
- stream << vec << "=";
- // Do not read the first undef lane.
- if (i != 0) {
- stream << vec << " & ~(0x000000ff << " << i * 8 << ") |";
+ if (t.lanes() == 2 || t.lanes() == 3) {
+ stream << vec << '.' << access[i % t.lanes()] << "="
+ << "(" << value << ");\n";
+ } else {
+ stream << vec << "=";
+ // Do not read the first undef lane.
+ if (i != 0) {
+ stream << vec << " & ~(0x000000ff << " << i * 8 << ") |";
+ }
+ stream << "(" << value << " << " << i * 8 << ");\n";
}
- stream << "(" << value << " << " << i * 8 << ");\n";
} else if (t.is_float16()) {
stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->"
<< access[i % 2] << " = " << value << ";\n";
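The store side follows the same split: for 4 lanes the generated code does a read-modify-write on the packed int (skipping the read-back on lane 0 so an uninitialized vector is never read), while for 2 or 3 lanes a lane store is now a plain member assignment. A hedged kernel sketch of both forms; identifiers are illustrative, and the masking lines mirror the emitted text:

    #include <cuda_runtime.h>

    __global__ void lane_store_demo(char3* out3, int* out4) {
      // lanes == 3: each lane store becomes a member assignment.
      char3 v3;
      v3.x = (1);
      v3.y = (2);
      v3.z = (3);
      out3[0] = v3;
      // lanes == 4: read-modify-write on the packed int. Lane 0
      // skips the read-back ("do not read the first undef lane"),
      // so v4 needs no initialization before its first lane store.
      int v4;
      v4 = (1 << 0);
      v4 = v4 & ~(0x000000ff << 8) | (2 << 8);
      v4 = v4 & ~(0x000000ff << 16) | (3 << 16);
      v4 = v4 & ~(0x000000ff << 24) | (4 << 24);
      out4[0] = v4;
    }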
@@ -789,11 +802,13 @@ void CodeGenCUDA::PrintVecElemLoadExpr(
DataType t, int i, const std::string& value, std::ostream& os) {
CHECK_GT(t.lanes(), 1);
if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
- if (i != 0) {
- os << "|";
+ if (!(t.lanes() == 2 || t.lanes() == 3)) {
+ if (i != 0) {
+ os << "|";
+ }
+ os << "((0x000000ff << " << i * 8 << ") & (" << value << " << " << i * 8 << "))";
+ return;
}
- os << "((0x000000ff << " << i * 8 << ") & (" << value << " << " << i * 8 << "))";
- return;
}
if (t.is_float16()) {
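PrintVecElemLoadExpr assembles a whole vector value lane by lane. Four-lane int8 keeps the shift-and-OR accumulation shown above; two- and three-lane vectors now skip it and fall through to the generic constructor path, which for char2/char3 would come out as CUDA's standard make_char2/make_char3 helpers. A sketch under that assumption (names illustrative):

    #include <cuda_runtime.h>

    __global__ void construct_demo(char a, char b, char c, char d,
                                   char3* out3, int* out4) {
      // lanes == 2 or 3: native vector constructors, no bit packing.
      char2 v2 = make_char2(a, b);
      char3 v3 = make_char3(a, b, c);
      // lanes == 4: the shift-and-OR accumulation left intact above.
      int v4 = ((0x000000ff << 0) & (a << 0))
             | ((0x000000ff << 8) & (b << 8))
             | ((0x000000ff << 16) & (c << 16))
             | ((0x000000ff << 24) & (d << 24));
      out3[0] = v3;
      out4[0] = v4;
      (void)v2;
    }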
diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py
index 49a7933..50705e8 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -55,7 +55,12 @@ def test_cuda_vectorize_add():
check_cuda("float32", 64, 2)
check_cuda("float32", 64, 3)
check_cuda("float32", 64, 4)
+ check_cuda("int8", 64, 2)
+ check_cuda("int8", 64, 3)
check_cuda("int8", 64, 4)
+ check_cuda("uint8", 64, 2)
+ check_cuda("uint8", 64, 3)
+ check_cuda("uint8", 64, 4)
check_cuda("float16", 64, 2)
check_cuda("float16", 64, 4)
check_cuda("float16", 64, 6)
@@ -112,15 +117,17 @@ def test_cuda_vectorize_load():
b = tvm.nd.empty((n,), B.dtype, ctx)
fun(a,b)
tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy())
+ check_cuda("int8", 64, 2)
+ check_cuda("int8", 64, 3)
+ check_cuda("int8", 64, 4)
check_cuda("int8", 64, 8)
check_cuda("int8", 64, 16)
-def test_cuda_make_int8x4():
- def check_cuda(n, value):
+def test_cuda_make_int8():
+ def check_cuda(n, value, lanes):
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
print("skip because cuda is not enabled..")
return
- lanes = 4
dtype = 'int8'
ctx = tvm.gpu(0)
A = te.compute((n, lanes), lambda i,j: tvm.tir.const(value, dtype=dtype))
@@ -133,9 +140,15 @@ def test_cuda_make_int8x4():
a = tvm.nd.empty(np_a.shape, dtype, ctx)
fun(a)
np.testing.assert_equal(a.asnumpy(), np_a)
- check_cuda(64, 0xAB)
- check_cuda(64, 0)
- check_cuda(64, -3)
+ check_cuda(64, 0xAB, 4)
+ check_cuda(64, 0, 4)
+ check_cuda(64, -3, 4)
+ check_cuda(64, 0xAB, 3)
+ check_cuda(64, 0, 3)
+ check_cuda(64, -3, 3)
+ check_cuda(64, 0xAB, 2)
+ check_cuda(64, 0, 2)
+ check_cuda(64, -3, 2)
def test_cuda_inf_nan():
@@ -579,6 +592,8 @@ def test_cuda_vectorize_load_permute_pad():
(0, 0)), mode='constant', constant_values=0)
tvm.testing.assert_allclose(b.asnumpy(), ref)
+ check_cuda("int8", 64, 16, 3, 2)
+ check_cuda("uint8", 64, 16, 3, 2)
check_cuda("int8", 64, 16, 3, 4)
check_cuda("uint8", 64, 16, 3, 4)
check_cuda("int32", 64, 16, 3, 4)
@@ -589,7 +604,7 @@ if __name__ == "__main__":
test_cuda_vectorize_add()
test_cuda_multiply_add()
test_cuda_vectorize_load()
- test_cuda_make_int8x4()
+ test_cuda_make_int8()
test_cuda_inf_nan()
test_cuda_shuffle()
test_vectorized_casts()