Posted to commits@tvm.apache.org by ma...@apache.org on 2021/06/18 06:58:45 UTC
[tvm] branch main updated: [CUDA][PASS] conv2d NHWC/HWNC legalize tensorcore (#8222)
This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 5f94c1e [CUDA][PASS] conv2d NHWC/HWNC legalize tensorcore (#8222)
5f94c1e is described below
commit 5f94c1e0fa7ccd3e6947074a594f136a43da9cce
Author: Wang Yucheng <wy...@163.com>
AuthorDate: Fri Jun 18 14:58:31 2021 +0800
[CUDA][PASS] conv2d NHWC/HWNC legalize tensorcore (#8222)
* add conv2d legalize
* minor fix
* fix pylint
* fix pylint
Co-authored-by: wangyucheng <wa...@sensetime.com>
---
python/tvm/topi/cuda/conv2d_alter_op.py | 169 ++++++++++++++++++---
python/tvm/topi/cuda/tensorcore_alter_op.py | 10 +-
.../python/relay/test_pass_legalize_tensorcore.py | 127 +++++++++++++---
3 files changed, 263 insertions(+), 43 deletions(-)
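Context for the diff below: tensor core schedules require the (batch, in_channel, out_channel) triple of a conv2d to be a multiple of one of the supported tile shapes, e.g. (16, 16, 16), (32, 16, 8) or (8, 16, 32) for float16/int8 NHWC, (8, 16, 32) for int8 HWNC, and (8, 32, 8) for int4 HWNC. When a shape falls short, the legalize pass pads data and kernel up to the nearest multiple, runs the padded conv2d, and slices the result back to the original shape. A minimal sketch of the shape arithmetic (the helper name pad_deltas is illustrative, not part of the patch):

    # Illustrative only: padding needed to lift (batch, in_channel, out_channel)
    # to a multiple of a tensorcore tile (tb, ti, to).
    def pad_deltas(batch, in_channel, out_channel, tile):
        tb, ti, to = tile
        db = (tb - batch % tb) % tb        # extra batch rows
        di = (ti - in_channel % ti) % ti   # extra input channels
        do = (to - out_channel % to) % to  # extra output channels
        return db, di, do

    # batch=7 against the (8, 16, 32) tile: pad batch by 1, channels already fit.
    print(pad_deltas(7, 64, 64, (8, 16, 32)))  # -> (1, 0, 0)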
diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py
index 067f272..4863a06 100644
--- a/python/tvm/topi/cuda/conv2d_alter_op.py
+++ b/python/tvm/topi/cuda/conv2d_alter_op.py
@@ -270,6 +270,60 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     return None
+def _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor):
+    # Pad batch size
+    if db != 0:
+        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, db), (0, 0)))
+
+    # Pad input channel
+    if di != 0:
+        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+
+    # Pad output channel
+    if do != 0:
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, do), (0, 0)))
+
+    if do != 0:
+        new_out_channel = out_channel + do
+        new_attrs["channels"] = new_out_channel
+
+    out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+    if db != 0 or do != 0:
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
+
+    return out
+
+
+def _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor):
+    # Pad batch size
+    if db != 0:
+        data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
+
+    # Pad input channel
+    if di != 0:
+        data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
+
+    # Pad output channel
+    if do != 0:
+        kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
+
+    if do != 0:
+        new_out_channel = out_channel + do
+        new_attrs["channels"] = new_out_channel
+
+    out = relay.nn.conv2d(data, kernel, **new_attrs)
+
+    if db != 0 or do != 0:
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
+
+    return out
+
+
 @conv2d_legalize.register("cuda")
 def _conv2d_legalize(attrs, inputs, arg_types):
     """Legalizes Conv2D op.
@@ -347,7 +401,7 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             else:
                 out = relay.nn.conv2d(data, kernel, **new_attrs)
             return out
-    elif data_dtype in ["float16"]:  # todo: support int8/int4
+
         if data_layout == "NHWC" and kernel_layout == "HWIO":
             batch = data_tensor.shape[0].value
             in_channel = data_tensor.shape[3].value
@@ -361,7 +415,10 @@ def _conv2d_legalize(attrs, inputs, arg_types):
                 # no need to pad
                 return None
-            (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel)
+            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
             if extra_flops > 2:
                 logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -369,28 +426,100 @@ def _conv2d_legalize(attrs, inputs, arg_types):
             logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-            # Pad batch size
-            if db != 0:
-                data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0)))
+            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-            # Pad input channel
-            if di != 0:
-                data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di)))
-                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0)))
+        if data_layout == "HWNC" and kernel_layout == "HWOI":
+            batch = data_tensor.shape[2].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[2].value
-            # Pad output channel
-            if do != 0:
-                kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do)))
+            if batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0:
+                return None
-            if do != 0:
-                new_out_channel = out_channel + do
-                new_attrs["channels"] = new_out_channel
+            candidates = [(8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
-            out = relay.nn.conv2d(data, kernel, **new_attrs)
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
-            if db != 0 or do != 0:
-                original_out_shape = [x.value for x in output_tensor.shape]
-                out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
+            return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+
+    elif data_dtype in ["float16"]:
+        if data_layout == "NHWC" and kernel_layout == "HWIO":
+            batch = data_tensor.shape[0].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[3].value
+
+            if (
+                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
+                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
+                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
+            ):
+                # no need to pad
+                return None
+
+            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+
+    elif data_dtype in ["int4", "uint4"]:
+        if data_layout == "NHWC" and kernel_layout == "HWIO":
+            batch = data_tensor.shape[0].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[3].value
+
+            if (
+                (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0)
+                or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0)
+                or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0)
+            ):
+                # no need to pad
+                return None
+
+            candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_NHWC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
+
+        if data_layout == "HWNC" and kernel_layout == "HWOI":
+            batch = data_tensor.shape[2].value
+            in_channel = data_tensor.shape[3].value
+            out_channel = kernel_tensor.shape[2].value
+
+            if batch % 8 == 0 and in_channel % 32 == 0 and out_channel % 8 == 0:
+                return None
+
+            candidates = [(8, 32, 8)]
+            (db, di, do), extra_flops = pad_to_tensorcore(
+                batch, in_channel, out_channel, candidates
+            )
+
+            if extra_flops > 2:
+                logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops)
+                return None
+            logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops)
+
+            return _pad_conv2d_HWNC(db, di, do, data, kernel, out_channel, new_attrs, output_tensor)
-            return out
     return None
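The extra_flops guard above rejects padding whose relative cost exceeds a budget of 2, i.e. when the padded convolution would do more than 3x the original work. A hedged reconstruction of that ratio (the name extra_flops_ratio is illustrative, not code from the patch):

    # Relative extra work introduced by padding; the pass skips legalization
    # when this ratio exceeds 2.
    def extra_flops_ratio(batch, in_channel, out_channel, deltas):
        db, di, do = deltas
        base = batch * in_channel * out_channel
        padded = (batch + db) * (in_channel + di) * (out_channel + do)
        return (padded - base) / base

    # Padding batch 3 -> 8 with 64/64 channels costs 5/3 extra work, under the cap.
    print(extra_flops_ratio(3, 64, 64, (5, 0, 0)))  # -> ~1.67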
diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py
index aec7acb..eb7c71d 100644
--- a/python/tvm/topi/cuda/tensorcore_alter_op.py
+++ b/python/tvm/topi/cuda/tensorcore_alter_op.py
@@ -71,7 +71,8 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
             # no need to pad
             return None
-        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N)
+        candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
         if extra_flops > 2:
             logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops)
@@ -145,7 +146,8 @@ def _dense_legalize(attrs, inputs, arg_types):
             # no need to pad
             return None
-        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N)
+        candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
+        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates)
         if extra_flops_ratio > 2:
             logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio)
@@ -171,10 +173,8 @@ def _dense_legalize(attrs, inputs, arg_types):
     return None
-def pad_to_tensorcore(M, K, N):
+def pad_to_tensorcore(M, K, N, candidates):
     """pad shape to enable tensorcore"""
-    candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-
     flops = M * K * N
     extra_flops = math.inf
     best_pad = (0, 0, 0)
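The hunk above only shows the head of pad_to_tensorcore; the selection loop that follows presumably iterates the caller-supplied candidates and keeps the tile whose padding adds the least work. A self-contained sketch under that assumption (a reconstruction, not the verbatim function body):

    import math

    def pad_to_tensorcore(M, K, N, candidates):
        """Sketch: pick the candidate tile whose padding adds the fewest FLOPs."""
        flops = M * K * N
        extra_flops = math.inf
        best_pad = (0, 0, 0)
        for tm, tk, tn in candidates:
            # padding needed on each axis to reach a multiple of the tile
            dm = (tm - M % tm) % tm
            dk = (tk - K % tk) % tk
            dn = (tn - N % tn) % tn
            extra = (M + dm) * (K + dk) * (N + dn) - flops
            if extra < extra_flops:
                extra_flops = extra
                best_pad = (dm, dk, dn)
        return best_pad, extra_flops / flops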
diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py
index f45e390..1312b39 100644
--- a/tests/python/relay/test_pass_legalize_tensorcore.py
+++ b/tests/python/relay/test_pass_legalize_tensorcore.py
@@ -36,18 +36,18 @@ def run_opt_pass(expr, passes):
 @tvm.testing.uses_gpu
-def test_legalize_conv2d():
-    """test legalize conv2d to enable tensorcore"""
+def test_legalize_conv2d_NHWC():
+    """test legalize NHWC conv2d to enable tensorcore"""
-    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True):
+    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, dtype, do_pad=True):
         out_channel = kernel_shape[3]
         out_shape = list(data_shape)
         out_shape[3] = out_channel
         db, di, do = pad_shape
         def before():
-            x = relay.var("x", shape=data_shape, dtype="float16")
-            weight = relay.var("weight", shape=kernel_shape, dtype="float16")
+            x = relay.var("x", shape=data_shape, dtype=dtype)
+            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
             y = relay.nn.conv2d(
                 x,
                 weight,
@@ -67,12 +67,12 @@ def test_legalize_conv2d():
         def expected():
             if not do_pad:
                 return before()
-            x = relay.var("x", shape=data_shape, dtype="float16")
+            x = relay.var("x", shape=data_shape, dtype=dtype)
             if db or di:
                 x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di)))
             else:
                 x_pad = x
-            weight = relay.var("weight", shape=(kernel_shape), dtype="float16")
+            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
             if di or do:
                 weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do)))
             else:
@@ -99,19 +99,109 @@ def test_legalize_conv2d():
             b = run_opt_pass(expected(), transform.InferType())
             assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+    for dtype in ["float16", "int8", "int4"]:
+        # conv2d pad batch
+        _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0), dtype)
+        _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0), dtype)
+        _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), dtype, False)
+        # conv2d pad in_channel
+        _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0), dtype)
+        _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0), dtype)
+        _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0), dtype)
+        _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), dtype, False)
+        # conv2d pad out_channel
+        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1), dtype)
+        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31), dtype)
+        _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), dtype, False)
+
+
+@tvm.testing.uses_gpu
+def test_legalize_conv2d_HWNC():
+    """test legalize HWNC conv2d to enable tensorcore"""
+
+    def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, dtype, do_pad=True):
+        out_channel = kernel_shape[2]
+        out_shape = list(data_shape)
+        out_shape[3] = out_channel
+        db, di, do = pad_shape
+
+        def before():
+            x = relay.var("x", shape=data_shape, dtype=dtype)
+            weight = relay.var("weight", shape=kernel_shape, dtype=dtype)
+            y = relay.nn.conv2d(
+                x,
+                weight,
+                channels=out_channel,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="HWNC",
+                kernel_layout="HWOI",
+            )
+            y = relay.Function([x, weight], y)
+            return y
+
+        def legalize_conv2d(attrs, inputs, types):
+            with tvm.target.Target("cuda"):
+                return topi.nn.conv2d_legalize(attrs, inputs, types)
+
+        def expected():
+            if not do_pad:
+                return before()
+            x = relay.var("x", shape=data_shape, dtype=dtype)
+            if db or di:
+                x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, 0), (0, db), (0, di)))
+            else:
+                x_pad = x
+            weight = relay.var("weight", shape=(kernel_shape), dtype=dtype)
+            if di or do:
+                weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, do), (0, di)))
+            else:
+                weight_pad = weight
+            y_pad = relay.nn.conv2d(
+                x_pad,
+                weight=weight_pad,
+                channels=out_channel + do,
+                kernel_size=(3, 3),
+                padding=(1, 1),
+                data_layout="HWNC",
+                kernel_layout="HWOI",
+            )
+            if db or do:
+                y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape)
+            else:
+                y = y_pad
+            y = relay.Function([x, weight], y)
+            return y
+
+        with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d):
+            a = before()
+            a = run_opt_pass(a, transform.Legalize())
+            b = run_opt_pass(expected(), transform.InferType())
+            assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b)
+
     # conv2d pad batch
-    _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0))
-    _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0))
-    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False)
+    _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int8")
+    _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int8")
+    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), "int8", False)
+    _test_legalize_conv2d((16, 16, 7, 64), (3, 3, 64, 64), (1, 0, 0), "int4")
+    _test_legalize_conv2d((16, 16, 3, 64), (3, 3, 64, 64), (5, 0, 0), "int4")
+    _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), "int4", False)
     # conv2d pad in_channel
-    _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0))
-    _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0))
-    _test_legalize_conv2d((8, 16, 16, 13), (3, 3, 13, 64), (0, 3, 0))
-    _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False)
+    _test_legalize_conv2d((16, 16, 8, 63), (3, 3, 64, 63), (0, 1, 0), "int8")
+    _test_legalize_conv2d((16, 16, 8, 33), (3, 3, 64, 33), (0, 15, 0), "int8")
+    _test_legalize_conv2d((16, 16, 8, 13), (3, 3, 64, 13), (0, 3, 0), "int8")
+    _test_legalize_conv2d((16, 16, 8, 1), (3, 3, 64, 1), (0, 0, 0), "int8", False)
+    _test_legalize_conv2d((16, 16, 8, 63), (3, 3, 64, 63), (0, 1, 0), "int4")
+    _test_legalize_conv2d((16, 16, 8, 33), (3, 3, 64, 33), (0, 31, 0), "int4")
+    _test_legalize_conv2d((16, 16, 8, 13), (3, 3, 64, 13), (0, 19, 0), "int4")
+    _test_legalize_conv2d((16, 16, 8, 1), (3, 3, 64, 1), (0, 0, 0), "int4", False)
     # conv2d pad out_channel
-    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1))
-    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31))
-    _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False)
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 63, 64), (0, 0, 1), "int8")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 31), "int8")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int8", False)
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 63, 64), (0, 0, 1), "int4")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 33, 64), (0, 0, 7), "int4")
+    _test_legalize_conv2d((16, 16, 8, 64), (3, 3, 1, 64), (0, 0, 0), "int4", False)
 @tvm.testing.uses_gpu
@@ -234,6 +324,7 @@ def test_legalize_batch_matmul():
 if __name__ == "__main__":
-    test_legalize_conv2d()
+    test_legalize_conv2d_NHWC()
+    test_legalize_conv2d_HWNC()
     test_legalize_dense()
     test_legalize_batch_matmul()
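To exercise the new path outside the test file, the Legalize pass can be driven directly under the cuda target. A minimal sketch, assuming a CUDA-enabled TVM build (untested here; shapes chosen so the batch dimension should be padded 7 -> 8):

    import tvm
    from tvm import relay
    from tvm.relay import transform

    # A conv2d whose batch (7) is not tensorcore-friendly.
    x = relay.var("x", shape=(7, 16, 16, 64), dtype="float16")
    w = relay.var("w", shape=(3, 3, 64, 64), dtype="float16")
    y = relay.nn.conv2d(
        x, w, channels=64, kernel_size=(3, 3), padding=(1, 1),
        data_layout="NHWC", kernel_layout="HWIO",
    )
    mod = tvm.IRModule.from_expr(relay.Function([x, w], y))

    # Run Legalize with cuda as the current target so the CUDA rule is picked.
    with tvm.target.Target("cuda"):
        mod = transform.Legalize()(mod)
    print(mod)  # expect pad -> conv2d -> strided_slice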