Posted to commits@tvm.apache.org by lm...@apache.org on 2020/04/04 03:46:38 UTC
[incubator-tvm] branch master updated: Fix intel conv2d auto tune (#5200)
This is an automated email from the ASF dual-hosted git repository.
lmzheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-tvm.git
The following commit(s) were added to refs/heads/master by this push:
new 0cfdecd Fix intel conv2d auto tune (#5200)
0cfdecd is described below
commit 0cfdecdae99582998dae5c2c3fdfd7a2700f10c0
Author: Yao Wang <ke...@gmail.com>
AuthorDate: Fri Apr 3 20:46:32 2020 -0700
Fix intel conv2d auto tune (#5200)
* Fix x86 conv2d and depthwise conv2d auto tuning
* Fix depthwise conv2d infer layout
* Use random data instead of empty data for autotvm
* Fix pylint
* Keep empty array for now for autotvm
---
topi/python/topi/x86/conv2d.py | 14 ++++++++++++-
topi/python/topi/x86/conv2d_avx_1x1.py | 10 ++-------
topi/python/topi/x86/conv2d_avx_common.py | 10 ++-------
topi/python/topi/x86/depthwise_conv2d.py | 35 +++++++++++++++++--------------
4 files changed, 36 insertions(+), 33 deletions(-)
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 81d848a..d875f8d 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -185,7 +185,19 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo
# Pack data if raw 4-D data is provided.
# This can only happen when autotuning.
if len(data.shape) == 4:
- data, kernel = _pack_data(cfg, data, kernel)
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ # Directly use modified data layout placeholder.
+ dshape = (n, in_channel // cfg["tile_ic"].size[-1],
+ ih, iw, cfg["tile_ic"].size[-1])
+ data = tvm.te.placeholder(dshape, data.dtype, name="data")
+ kshape = (num_filter // cfg["tile_oc"].size[-1],
+ in_channel // cfg["tile_ic"].size[-1],
+ kernel_height, kernel_width,
+ cfg["tile_ic"].size[-1],
+ cfg["tile_oc"].size[-1])
+ kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel")
+ else:
+ data, kernel = _pack_data(cfg, data, kernel)
return nn.conv2d_NCHWc(data,
kernel,
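Note on the hunk above: during tuning, the pre-packed placeholders stand in for the layout transform so the tuning records measure only the convolution itself. A minimal standalone sketch of the same shape construction, with fixed tile sizes (ic_bn, oc_bn) standing in for the cfg["tile_ic"]/cfg["tile_oc"] split entities; the workload numbers are made up:

import tvm

# Hypothetical workload and split factors; in the real code these come from
# the AutoTVM config as cfg["tile_ic"].size[-1] and cfg["tile_oc"].size[-1].
n, in_channel, ih, iw = 1, 64, 56, 56
num_filter, kernel_height, kernel_width = 64, 3, 3
ic_bn, oc_bn = 16, 16

# Data packed as NCHWc: (N, C // ic_bn, H, W, ic_bn)
dshape = (n, in_channel // ic_bn, ih, iw, ic_bn)
data = tvm.te.placeholder(dshape, dtype="float32", name="data")

# Kernel packed as OIHWio: (O // oc_bn, I // ic_bn, KH, KW, ic_bn, oc_bn)
kshape = (num_filter // oc_bn, in_channel // ic_bn,
          kernel_height, kernel_width, ic_bn, oc_bn)
kernel = tvm.te.placeholder(kshape, dtype="float32", name="kernel")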
diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index 432f8b2..978c4b9 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -19,7 +19,6 @@
from __future__ import absolute_import as _abs
import tvm
from tvm import te
-from tvm import autotvm
from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
from ..nn.pad import pad
@@ -69,17 +68,12 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last):
if isinstance(s[data_vec].op, tvm.te.ComputeOp) \
and "pad" in data_vec.op.tag:
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
+ s[data_vec].vectorize(ic_block)
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
data_vec = data_vec.op.input_tensors[0]
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # only in autotuning, input data of conv2d_NCHWc will be 4-D.
- # skip this part during tuning to make records accurate.
- # this part will be folded during Relay fold_constant pass.
- s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
- s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
- elif isinstance(kernel_vec.op, tvm.te.ComputeOp) and \
+ if isinstance(kernel_vec.op, tvm.te.ComputeOp) and \
kernel_vec.name == 'kernel_vec':
# data and kernel are not pre-computed, schedule layout transform here.
# this should only be used by x86 conv2d_nchw, which is for
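The schedule change in this hunk (applied identically in conv2d_avx_common.py below) vectorizes the innermost channel block of the padding stage before fusing and parallelizing the outer axes, replacing the old debug_skip_region pragmas. A self-contained sketch of that pattern on an illustrative 5-D padded tensor, outside the real topi schedule:

import tvm
from tvm import te

# Illustrative padded-data shape: (batch, ic_chunk, ih, iw, ic_block).
batch, ic_chunk, ih, iw, ic_block = 1, 4, 58, 58, 16
A = te.placeholder((batch, ic_chunk, ih, iw, ic_block), name="A")
data_pad = te.compute(A.shape, lambda n, c, h, w, cb: A[n, c, h, w, cb],
                      name="data_pad", tag="pad")

s = te.create_schedule(data_pad.op)
n, c, h, w, cb = s[data_pad].op.axis
s[data_pad].vectorize(cb)              # vectorize the innermost channel block
parallel_axis = s[data_pad].fuse(n, c, h)
s[data_pad].parallel(parallel_axis)    # parallelize the fused outer axes
print(tvm.lower(s, [A, data_pad], simple_mode=True))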
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index ebed14c..a88d168 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -17,7 +17,6 @@
# pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
"""Conv2D schedule on for Intel CPU"""
import tvm
-from tvm import autotvm
from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
from ..generic import conv2d as conv2d_generic
@@ -91,17 +90,12 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last):
if isinstance(s[data_vec].op, tvm.te.ComputeOp) \
and "pad" in data_vec.op.tag:
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
+ s[data_vec].vectorize(ic_block)
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
data_vec = data_vec.op.input_tensors[0]
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # only in autotuning, input data of conv2d_NCHWc will be 4-D.
- # skip this part during tuning to make records accurate.
- # this part will be folded during Relay fold_constant pass.
- s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
- s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
- elif isinstance(kernel_vec.op, tvm.te.ComputeOp) and \
+ if isinstance(kernel_vec.op, tvm.te.ComputeOp) and \
kernel_vec.name == 'kernel_vec':
# data and kernel are not pre-computed, schedule layout transform here.
# this should only be used by x86 conv2d_nchw, which is for
diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py
index 240dee0..0976c33 100644
--- a/topi/python/topi/x86/depthwise_conv2d.py
+++ b/topi/python/topi/x86/depthwise_conv2d.py
@@ -43,7 +43,6 @@ def _fallback_schedule(cfg, wkl):
HPAD, WPAD = wkl.hpad, wkl.wpad
HSTR, WSTR = wkl.hstride, wkl.wstride
- out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
oc_bn = 1
@@ -148,10 +147,21 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation,
# Pack data if raw 4-D data is provided.
# This can only happen when autotuning.
if len(data.shape) == 4:
- data, kernel = _pack_data(cfg, data, kernel)
- _, _, _, _, in_channel_block = get_const_tuple(data.shape)
- out_channel_chunk, _, _, _, _, out_channel_block \
- = get_const_tuple(kernel.shape)
+ if autotvm.GLOBAL_SCOPE.in_tuning:
+ # Directly use modified data layout placeholder.
+ in_channel_block = cfg["tile_ic"].size[-1]
+ in_channel_chunk = in_channel // in_channel_block
+ out_channel_block = cfg["tile_oc"].size[-1]
+ out_channel_chunk = out_channel // out_channel_block
+ dshape = (batch, in_channel_chunk, in_height, in_width, in_channel_block)
+ data = tvm.te.placeholder(dshape, data.dtype, name="data")
+ kshape = (out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block)
+ kernel = tvm.te.placeholder(kshape, kernel.dtype, name="kernel")
+ else:
+ data, kernel = _pack_data(cfg, data, kernel)
+ _, _, _, _, in_channel_block = get_const_tuple(data.shape)
+ out_channel_chunk, _, _, _, _, out_channel_block \
+ = get_const_tuple(kernel.shape)
# padding stage
DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
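As in the regular conv2d case, the placeholders in the hunk above replace the packing compute during tuning. For depthwise convolution the packed kernel keeps singleton dimensions in the in-channel-chunk and in-channel-block positions, since each output channel reads only its own input channel. Illustrative shapes, with made-up workload and tile sizes:

import tvm

# Hypothetical depthwise workload; with channel_multiplier == 1,
# out_channel == in_channel, and the kernel carries 1-sized dims where a
# regular conv has the in-channel chunk and block.
batch, in_channel, in_height, in_width = 1, 32, 112, 112
filter_height, filter_width = 3, 3
ic_bn = oc_bn = 8
out_channel = in_channel

dshape = (batch, in_channel // ic_bn, in_height, in_width, ic_bn)
data = tvm.te.placeholder(dshape, dtype="float32", name="data")
kshape = (out_channel // oc_bn, 1, filter_height, filter_width, 1, oc_bn)
kernel = tvm.te.placeholder(kshape, dtype="float32", name="kernel")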
@@ -207,16 +217,9 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out
if isinstance(s[data_vec].op, tvm.te.ComputeOp) \
and "pad" in data_vec.op.tag:
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
+ s[data_vec].vectorize(ic_block)
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
- data_vec = data_vec.op.input_tensors[0]
-
- if autotvm.GLOBAL_SCOPE.in_tuning:
- # only in autotuning, input data of conv2d_NCHWc will be 4-D.
- # skip this part during tuning to make recrods accurate.
- # this part will be folded during Relay fold_constant pass.
- s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
- s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
C, O = conv_out, output
CC = s.cache_write(C, 'global')
@@ -264,12 +267,12 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out
@depthwise_conv2d_infer_layout.register("cpu")
def _depthwise_conv2d_infer_layout(workload, cfg):
- _, data, kernel, strides, padding, dilation, dtype = workload
+ _, data, kernel, strides, padding, dilation, _, _, dtype = workload
batch_size, in_channel, in_height, in_width = data[1]
filter_channel, channel_multiplier, k_height, k_width = kernel[1]
out_channel = filter_channel * channel_multiplier
- out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1
- out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1
+ out_height = (in_height + padding[0] + padding[2] - k_height) // strides[0] + 1
+ out_width = (in_width + padding[1] + padding[3] - k_width) // strides[1] + 1
tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic)
in_layout = "NCHW%dc" % tile_ic
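The infer-layout fix above accounts for the workload now carrying a four-element padding tuple (top, left, bottom, right) instead of a symmetric (h, w) pair, alongside the two extra workload fields unpacked as `_`. A quick numeric check of the corrected output-size arithmetic, with made-up values:

# padding = (top, left, bottom, right); strides = (stride_h, stride_w)
in_height, in_width = 56, 56
k_height, k_width = 3, 3
strides = (2, 2)
padding = (1, 1, 1, 1)

out_height = (in_height + padding[0] + padding[2] - k_height) // strides[0] + 1
out_width = (in_width + padding[1] + padding[3] - k_width) // strides[1] + 1
print(out_height, out_width)  # -> 28 28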