Posted to commits@tvm.apache.org by ma...@apache.org on 2021/11/02 21:23:17 UTC
[tvm] branch main updated: BUG: alloc_tensor offset and reshape shape should be on the CPU (#9421)
This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new aeff3ea BUG: alloc_tensor offset and reshape shape should be on the CPU (#9421)
aeff3ea is described below
commit aeff3ea711807ebd031ebd469641c45bdbd02360
Author: Mark Shields <87...@users.noreply.github.com>
AuthorDate: Tue Nov 2 14:22:32 2021 -0700
BUG: alloc_tensor offset and reshape shape should be on the CPU (#9421)
* BUG: alloc_tensor offset and reshape shape should be on the CPU
The VM ManifestAlloc pass was allocating constants in a few places I
forgot to tag with on_device for the host/CPU. As a result the runtime
would (silently) perform the cross-device copy, which destroys performance.
To make this easier to spot in the future, added a 'constants' property
to the VM Executable that dumps the shape and device of every VM constant
(a usage sketch follows the commit notes below).
This is CORE-102 in OctoML JIRA.
* [checkpoint] Older compilers can't handle << overload
* [checkpoint] Woops, forgot requires_cuda
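For illustration, a minimal sketch (assuming a CUDA-enabled TVM build) of how the new
'constants' property can be used to check constant placement. It follows the pattern of
the tests added in this commit; device type 1 is the CPU host and device type 2 is CUDA.

import tvm
from tvm import relay

def make_module():
    # Small Relay module whose parameters and result live on the GPU (device type 2).
    return tvm.parser.fromtext(
        """
        #[version = "0.0.5"]
        def @main(%a: Tensor[(5, 7), float32],
                  param_device_types=[2], result_device_type=2) {
            add(%a, %a)
        }
        """
    )

exe = relay.vm.compile(make_module(), tvm.target.Target("cuda"))

# Each entry is roughly of the form:
#   VM Constant[0]: has shape int64[] on device of type 1
print(exe.constants)

# The storage size and tensor offset constants should be pinned to the
# CPU host (device type 1), never to the GPU (device type 2).
assert "on device of type 2" not in exe.constants
assert "on device of type 1" in exe.constants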
---
include/tvm/runtime/vm/executable.h | 7 +++++
python/tvm/runtime/vm.py | 7 +++++
src/relay/transforms/memory_alloc.cc | 15 +++++----
src/runtime/vm/executable.cc | 30 ++++++++++++++++++
tests/python/relay/test_vm.py | 61 ++++++++++++++++++++++++++++++++++++
5 files changed, 114 insertions(+), 6 deletions(-)
diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h
index 2cdd180..6e564fd 100644
--- a/include/tvm/runtime/vm/executable.h
+++ b/include/tvm/runtime/vm/executable.h
@@ -132,6 +132,13 @@ class Executable : public ModuleNode {
std::string GetBytecode() const;
/*!
+ * \brief Returns a description of all the constants in the executable in human-readable
+ * format. Not intended to be machine readable, but rather to help with debugging and
+ * diffing generated code.
+ */
+ std::string GetConstants() const;
+
+ /*!
* \brief Print the detailed statistics of the given code, i.e. number of
* globals and constants, etc.
*/
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 8ebb0f6..c1cbc96 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -71,6 +71,7 @@ class Executable(object):
self._save = self.mod["save"]
self._get_lib = self.mod["get_lib"]
self._get_bytecode = self.mod["get_bytecode"]
+ self._get_constants = self.mod["get_constants"]
self._get_stats = self.mod["get_stats"]
self._get_function_arity = self.mod["get_function_arity"]
self._get_function_param_name = self.mod["get_function_param_name"]
@@ -245,6 +246,12 @@ class Executable(object):
return self._get_bytecode()
@property
+ def constants(self):
+ """Returns a human-readable description of all the constants in the executable.
+ Useful for debugging and diffing generated executables in unit tests."""
+ return self._get_constants()
+
+ @property
def globals(self):
"""Get the globals used by the Relay VM executable.
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
index dd582de..81d704e 100644
--- a/src/relay/transforms/memory_alloc.cc
+++ b/src/relay/transforms/memory_alloc.cc
@@ -62,8 +62,9 @@ inline Constant MakeConstant(const std::vector<int64_t>& value) {
}
inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype,
- Array<IndexExpr> assert_shape) {
- auto offset = MakeConstantScalar(DataType::Int(64), 0);
+ Array<IndexExpr> assert_shape, DLDeviceType offset_device_type) {
+ auto offset =
+ OnDevice(MakeConstantScalar(DataType::Int(64), 0), offset_device_type, /*is_fixed=*/true);
return AllocTensor(storage, offset, shape, dtype, assert_shape);
}
@@ -267,8 +268,9 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
auto sto = scope->Push(var, value);
// TODO(@jroesch): There is a bug with typing based on the constant shape.
- auto tensor = OnDevice(AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape),
- dev.device_type, /*is_fixed=*/true);
+ auto tensor = OnDevice(
+ AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape, cpu_device_.device_type),
+ dev.device_type, /*is_fixed=*/true);
Var tensor_var("tensor_" + name_hint, Type(nullptr));
return scope->Push(tensor_var, tensor);
}
@@ -367,7 +369,8 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
auto out_shape = out_shapes[i];
auto out_type = out_types[i];
auto storage = storages[i];
- auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape),
+ auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape,
+ cpu_device_.device_type),
dev.device_type, /*is_fixed=*/true);
Var out_var("out_" + std::to_string(i), Type(nullptr));
outs.push_back(scope->Push(out_var, alloc));
@@ -394,7 +397,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
CHECK(imm) << "expect static int shape";
shape.push_back(imm->value);
}
- shape_expr = MakeConstant(shape);
+ shape_expr = OnDevice(MakeConstant(shape), cpu_device_.device_type, /*is_fixed=*/true);
}
return ReshapeTensor(new_args[0], shape_expr, ret_ty->shape);
}
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 15d2aa0..4d7ee45 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -61,6 +61,8 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtr<Obje
} else if (name == "get_bytecode") {
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetBytecode(); });
+ } else if (name == "get_constants") {
+ return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetConstants(); });
} else if (name == "get_stats") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->Stats(); });
} else if (name == "save") {
@@ -146,6 +148,34 @@ std::string Executable::GetBytecode() const {
return oss.str();
}
+namespace {
+String ShapeString(const ShapeTuple& shape_tuple, DLDataType dtype) {
+ std::stringstream sizes;
+ sizes << DLDataType2String(dtype) << "[";
+ for (size_t i = 0; i < shape_tuple.size(); i++) {
+ if (i != 0) {
+ sizes << ", ";
+ }
+ sizes << shape_tuple.data()[i];
+ }
+ sizes << "]";
+ return String(sizes.str());
+}
+} // namespace
+
+std::string Executable::GetConstants() const {
+ std::ostringstream oss;
+
+ for (size_t i = 0; i < constants.size(); ++i) {
+ const auto& constant = constants[i];
+ auto ndarray = Downcast<NDArray>(constant);
+ DLDeviceType device_type = static_cast<DLDeviceType>(const_device_type[i]);
+ oss << "VM Constant[" << i << "]: has shape " << ShapeString(ndarray.Shape(), ndarray->dtype)
+ << " on device of type " << device_type << std::endl;
+ }
+ return oss.str();
+}
+
std::string Executable::Stats() const {
std::ostringstream oss;
oss << "Relay VM executable statistics:" << std::endl;
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 52a2fef..7997974 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -999,6 +999,67 @@ def test_shape_func_nested_function():
compiler.lower(mod, "llvm")
+@tvm.testing.requires_cuda
+def test_storage_size_and_offset_on_cpu():
+ """Tests allocations place sizes and offsets on the CPU host even if the rest
+ of the computation is on a different device type."""
+ # TODO(mbs): Better would be to test ManifestAlloc independently.
+
+ # CPU = device type 1
+ # GPU = device type 2
+ def input():
+ return tvm.parser.fromtext(
+ """
+ #[version = "0.0.5"]
+ def @main(%a: Tensor[(5, 7), float32],
+ param_device_types=[2], result_device_type=2) {
+ add(%a, %a)
+ }
+ """
+ )
+
+ exe = relay.vm.compile(
+ input(),
+ tvm.target.Target("cuda"),
+ )
+
+ # This program needs two constants:
+ # - The size of the tensor's storage (first arg) to alloc_storage
+ # - The offset of the tensor within the storage (second arg) to alloc_tensor
+ # Both should be on the CPU
+ assert not "on device of type 2" in exe.constants
+ assert "on device of type 1" in exe.constants
+
+
+@tvm.testing.requires_cuda
+def test_reshape_shape_on_cpu():
+ """Tests the argument to a reshape places the shape on the CPU host even if the rest
+ of the computation is on a different device type."""
+ # TODO(mbs): Better would be to test ManifestAlloc independently.
+
+ # CPU = device type 1
+ # GPU = device type 2
+ def input():
+ return tvm.parser.fromtext(
+ """
+ #[version = "0.0.5"]
+ def @main(%x: Tensor[(2, 8), float32],
+ param_device_types=[2], result_device_type=2) {
+ reshape(%x, newshape=[2, 4, 2])
+ }
+ """
+ )
+
+ exe = relay.vm.compile(
+ input(),
+ tvm.target.Target("cuda"),
+ )
+
+ # The newshape annotation should have been turned into a constant on the CPU.
+ assert not "on device of type 2" in exe.constants
+ assert "on device of type 1" in exe.constants
+
+
if __name__ == "__main__":
import sys