Posted to discuss-archive@tvm.apache.org by cali via Apache TVM Discuss <no...@discuss.tvm.ai> on 2021/03/05 11:24:45 UTC
[Apache TVM Discuss] [Questions] Use Tensorize to replace all code
Hi,
I would like to use tensorize to replace all of the code, but I get a segmentation fault. My tensorize intrinsic works when I apply it to the inner axis produced by a split, but when I try to replace the whole computation with it I hit this error. Do you know where it could come from?
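For context, here is a stripped-down sketch of the basic pattern I am following (the extern `add_one` is only a placeholder symbol for illustration; it does not exist anywhere). Tensorizing the inner axis of a split like this works for me:

```python
import tvm
from tvm import te

def intrin_add_one(l):
    # Declare an intrinsic whose compute pattern matches the target region.
    a = te.placeholder((l,), name="a")
    c = te.compute((l,), lambda i: a[i] + 1.0, name="c")
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1)
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1)

    def intrin_func(ins, outs):
        ib = tvm.tir.ir_builder.create()
        # add_one is a placeholder extern symbol, for illustration only.
        ib.emit(tvm.tir.call_extern("int32", "add_one",
                                    ins[0].access_ptr("r"),
                                    outs[0].access_ptr("w"), l))
        return ib.get()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})

A = te.placeholder((1024,), name="A")
B = te.compute((1024,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=16)
s[B].tensorize(xi, intrin_add_one(16))  # tensorizing the split inner axis works
print(tvm.lower(s, [A, B], simple_mode=True))
```

Below is my full code; the `tensorize_all` variable switches between tensorizing the tiled loops (which works) and tensorizing the entire computation (which segfaults).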
```python
from __future__ import absolute_import, print_function
import tvm
from tvm import te
import tvm.testing
import numpy as np
from tvm.topi.utils import get_const_tuple
tensorize_all = False
batch_size = 1
height = 32
width = 32
in_channels = 32
out_channels = 32
kernel_h = 3
kernel_w = 3
pad_h = 1
pad_w = 1
stride_h = 1
stride_w = 1
dilation_h = 1
dilation_w = 1
target = "llvm -mcpu=core-avx2"
ctx = tvm.context(target)
dtype = "float32"
A = te.placeholder((batch_size, in_channels, height + kernel_h - 1, width + kernel_w - 1), name="A")
W = te.placeholder((out_channels, in_channels, kernel_h, kernel_w), name="W")
axe_in_channels = te.reduce_axis((0, in_channels), name="axe_in_channels")
axe_kernel_h = te.reduce_axis((0, kernel_h), name="axe_kernel_h")
axe_kernel_w = te.reduce_axis((0, kernel_w), name="axe_kernel_w")
out_h = (height + 2 * pad_h - kernel_h) // stride_h + 1
out_w = (width + 2 * pad_w - kernel_w) // stride_w + 1
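# A is declared with the padding already applied (height + kernel_h - 1, ...),
# so the compute below convolves over the padded buffer directly and no
# separate pad stage appears in the compute definition.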
Out = te.compute(
    (batch_size, out_channels, out_h, out_w),
    lambda batch, out_channels, yy, xx: te.sum(
        A[batch, axe_in_channels, yy * stride_h + axe_kernel_h * dilation_h,
          xx * stride_w + axe_kernel_w * dilation_w]
        * W[out_channels, axe_in_channels, axe_kernel_h, axe_kernel_w],
        axis=[axe_in_channels, axe_kernel_h, axe_kernel_w],
    ),
)
s = te.create_schedule(Out.op)
axe_batch, axe_out_channels, axe_yy, axe_xx = Out.op.axis
axe_in_channels, axe_kernel_h, axe_kernel_w = Out.op.reduce_axis
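# When tensorize_all is False, tile every data axis by `factor` so that the
# inner out_channels/yy/xx/in_channels block has exactly the shape the
# intrinsic declares; tensorize is then applied to the first inner axis.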
if not tensorize_all:
    factor = 16
    axe_out_channelso, axe_out_channelsi = s[Out].split(axe_out_channels, factor=factor)
    axe_yyo, axe_yyi = s[Out].split(axe_yy, factor=factor)
    axe_xxo, axe_xxi = s[Out].split(axe_xx, factor=factor)
    axe_in_channelso, axe_in_channelsi = s[Out].split(axe_in_channels, factor=factor)
    s[Out].reorder(
        axe_batch, axe_out_channelso, axe_yyo, axe_xxo, axe_in_channelso,
        axe_out_channelsi, axe_yyi, axe_xxi, axe_in_channelsi,
        axe_kernel_h, axe_kernel_w,
    )
def conv_impl():
    cc_code = """
    extern "C" int Mconv2d(float *K, float *Input, float *Output,
                           int W, int H, int C, int F, int X, int Y,
                           int strideO1, int strideO2,
                           int strideA1, int strideA2,
                           int strideW1, int strideW2, int strideW3) {
        /*
        Simple convolution.
        Inputs:
          - K: weight
          - Input: input
          - Output: output
          - W: kernel_w
          - H: kernel_h
          - C: in_channels
          - F: out_channels
          - X: width
          - Y: height
        */
        for (int f = 0; f < F; f++) {
            for (int y = 0; y < Y; y++) {
                for (int x = 0; x < X; x++) {
                    for (int c = 0; c < C; c++) {
                        for (int h = 0; h < H; h++) {
                            for (int w = 0; w < W; w++) {
                                Output[f * strideO1 + y * strideO2 + x] +=
                                    Input[c * strideA1 + (y + h) * strideA2 + x + w]
                                    * K[f * strideW1 + c * strideW2 + h * strideW3 + w];
                            }
                        }
                    }
                }
            }
        }
        return 0;
    }

    extern "C" void Mconv2d_reset(float *Output, int F, int X, int Y,
                                  int strideO1, int strideO2) {
        /*
        Reset the output buffer to zero before the reduction.
        Inputs:
          - Output: output
          - F: out_channels
          - X: width
          - Y: height
        */
        for (int f = 0; f < F; f++) {
            for (int y = 0; y < Y; y++) {
                for (int x = 0; x < X; x++) {
                    Output[f * strideO1 + y * strideO2 + x] = 0.0;
                }
            }
        }
        return;
    }
    """
    from tvm.contrib import utils, clang

    temp = utils.tempdir()
    ll_path = temp.relpath("temp.ll")
    # Create LLVM IR from the C source code.
    ll_code = clang.create_llvm(cc_code, output=ll_path)
    return ll_code
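# intrin_gemv declares the tensor intrinsic: its compute pattern has to match
# the region covered by the tensorized axis, and its buffers use symbolic
# strides so the intrinsic can bind to sub-regions of Out.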
def intrin_gemv(W, H, C, F, X, Y):
    """
    W = kernel_w,
    H = kernel_h,
    C = in_channels,
    F = out_channels,
    X = width,
    Y = height
    """
    # Padding follows the main A placeholder: kernel_h extends height,
    # kernel_w extends width.
    a = te.placeholder((1, C, Y + H - 1, X + W - 1), name="a")
    w = te.placeholder((F, C, H, W), name="w")
    axe_in_channels = te.reduce_axis((0, C), name="axe_in_channels")
    axe_kernel_h = te.reduce_axis((0, H), name="axe_kernel_h")
    axe_kernel_w = te.reduce_axis((0, W), name="axe_kernel_w")
    strideA1 = te.var("sA1")
    strideA2 = te.var("sA2")
    strideA3 = te.var("sA3")
    strideB1 = te.var("sB1")
    strideB2 = te.var("sB2")
    strideB3 = te.var("sB3")
    strideC1 = te.var("sC1")
    strideC2 = te.var("sC2")
    strideC3 = te.var("sC3")
    o = te.compute(
        (1, F, Y, X),
        lambda batch, out_channels, yy, xx: te.sum(
            a[batch, axe_in_channels, yy + axe_kernel_h, xx + axe_kernel_w]
            * w[out_channels, axe_in_channels, axe_kernel_h, axe_kernel_w],
            axis=[axe_in_channels, axe_kernel_h, axe_kernel_w],
        ),
    )
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1,
                             strides=[strideA1, strideA2, strideA3, 1])
    Ww = tvm.tir.decl_buffer(w.shape, w.dtype, name="W", offset_factor=1,
                             strides=[strideB1, strideB2, strideB3, 1])
    Oo = tvm.tir.decl_buffer(o.shape, o.dtype, name="O", offset_factor=1,
                             strides=[strideC1, strideC2, strideC3, 1])

    def intrin_func(ins, outs):
        aa, bb = ins
        cc = outs[0]

        def _body():
            ib = tvm.tir.ir_builder.create()
            ib.emit(
                tvm.tir.call_extern(
                    "int32",
                    "Mconv2d",
                    bb.access_ptr("r"),
                    aa.access_ptr("r"),
                    cc.access_ptr("w"),
                    W,
                    H,
                    C,
                    F,
                    X,  # Mconv2d expects X (width) before Y (height)
                    Y,
                    cc.strides[1],
                    cc.strides[2],
                    aa.strides[1],
                    aa.strides[2],
                    bb.strides[0],
                    bb.strides[1],
                    bb.strides[2],
                )
            )
            return ib.get()

        def _reduce_reset():
            ib = tvm.tir.ir_builder.create()
            ib.emit(
                tvm.tir.call_extern(
                    "int32",
                    "Mconv2d_reset",
                    cc.access_ptr("w"),
                    F,
                    X,  # Mconv2d_reset also takes X (width) before Y (height)
                    Y,
                    cc.strides[1],
                    cc.strides[2],
                )
            )
            return ib.get()

        def _reduce_update():
            return _body()

        return _body(), _reduce_reset(), _reduce_update()

    return te.decl_tensor_intrin(o.op, intrin_func, binds={a: Ab, w: Ww, o: Oo})
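# With tensorize_all=True the intrinsic covers the whole op (tensorize on the
# outermost batch axis); otherwise only the factor-sized inner tile is replaced.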
if tensorize_all:
    # intrin_gemv takes kernel_w first: (W, H, C, F, X, Y)
    gemv = intrin_gemv(kernel_w, kernel_h, in_channels, out_channels, width, height)
    s[Out].tensorize(axe_batch, gemv)
    s[Out].pragma(axe_batch, "import_llvm", conv_impl())
else:
    gemv = intrin_gemv(kernel_w, kernel_h, factor, factor, factor, factor)
    s[Out].tensorize(axe_out_channelsi, gemv)
    s[Out].pragma(axe_batch, "import_llvm", conv_impl())
print(tvm.lower(s, [A, W, Out], simple_mode=True))
# Build
func = tvm.build(s, [A, W, Out], target=target, name="conv")
# Random input
a = tvm.nd.array(
    np.random.uniform(
        size=(batch_size, in_channels, height + kernel_h - 1, width + kernel_w - 1)
    ).astype(A.dtype),
    ctx,
)
w = tvm.nd.array(np.random.uniform(size=(out_channels, in_channels, kernel_h, kernel_w)).astype(W.dtype), ctx)
o = tvm.nd.array(np.ones(get_const_tuple(Out.shape), dtype=dtype), ctx)
# Run (a and w are already TVM NDArrays, so no extra wrapping is needed)
func(a, w, o)
# Verify the result
oo = np.zeros(get_const_tuple(Out.shape), dtype=dtype)
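# Plain NumPy reference implementation used to verify the tensorized result.
# Stride and dilation are both 1 in this setup, and A is pre-padded.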
def conv2d(weight, input_, output, batch_size, height, width, in_channels, out_channels,
           kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w):
    for b in range(batch_size):
        for c in range(out_channels):
            for y in range((height + 2 * pad_h - kernel_h) // stride_h + 1):
                for x in range((width + 2 * pad_w - kernel_w) // stride_w + 1):
                    output[b, c, y, x] = 0
                    for k in range(in_channels):
                        for dy in range(kernel_h):
                            for dx in range(kernel_w):
                                # stride_h moves along the height axis,
                                # stride_w along the width axis.
                                output[b, c, y, x] += (
                                    input_[b, k, stride_h * y + dy, stride_w * x + dx]
                                    * weight[c, k, dy, dx]
                                )
    return output
output_conv2d = conv2d(w.asnumpy(), a.asnumpy(), oo, batch_size, height, width, in_channels, out_channels, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w)
tvm.testing.assert_allclose(o.asnumpy(), output_conv2d, rtol=1e-5)
print("ok")
```
Thank you for your help
---
[Visit Topic](https://discuss.tvm.apache.org/t/use-tensorize-to-replace-all-code/9316/1) to respond.