You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2021/11/02 21:23:17 UTC
[tvm] branch main updated: BUG: alloc_tensor offset and reshape shape should be on the CPU (#9421)

This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new aeff3ea  BUG: alloc_tensor offset and reshape shape should be on the CPU (#9421)
aeff3ea is described below

commit aeff3ea711807ebd031ebd469641c45bdbd02360
Author: Mark Shields <87...@users.noreply.github.com>
AuthorDate: Tue Nov 2 14:22:32 2021 -0700

    BUG: alloc_tensor offset and reshape shape should be on the CPU (#9421)
    
    * BUG: alloc_tensor offset and reshape shape should be on the CPU
    
    The VM ManifestAlloc pass was allocating constants in a few places I
    forgot to tag with on_device for the host/cpu. As a result the runtime
    would (silently) do the x-device copy, which destroys perf.
    
    To make this easier to spot in the future added a 'constants' property
    to the VM Executable to dump the shape & device for all VM constants.
    
    This is CORE-102 in OctoML JIRA.
    
    * [checkpoint] Older compilers can't handle << overload
    
    * [checkpoint] Woops, forgot requires_cuda
---
 include/tvm/runtime/vm/executable.h  |  7 +++++
 python/tvm/runtime/vm.py             |  7 +++++
 src/relay/transforms/memory_alloc.cc | 15 +++++----
 src/runtime/vm/executable.cc         | 30 ++++++++++++++++++
 tests/python/relay/test_vm.py        | 61 ++++++++++++++++++++++++++++++++++++
 5 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h
index 2cdd180..6e564fd 100644
--- a/include/tvm/runtime/vm/executable.h
+++ b/include/tvm/runtime/vm/executable.h
@@ -132,6 +132,13 @@ class Executable : public ModuleNode {
   std::string GetBytecode() const;
 
   /*!
+   * \brief Returns a description of all the contants in the executable in human-readable
+   * format. Not intended to be machine readable, but rather to help with debugging and
+   * diffing generated code.
+   */
+  std::string GetConstants() const;
+
+  /*!
    * \brief Print the detailed statistics of the given code, i.e. number of
    * globls and constants, etc.
    */
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 8ebb0f6..c1cbc96 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -71,6 +71,7 @@ class Executable(object):
         self._save = self.mod["save"]
         self._get_lib = self.mod["get_lib"]
         self._get_bytecode = self.mod["get_bytecode"]
+        self._get_constants = self.mod["get_constants"]
         self._get_stats = self.mod["get_stats"]
         self._get_function_arity = self.mod["get_function_arity"]
         self._get_function_param_name = self.mod["get_function_param_name"]
@@ -245,6 +246,12 @@ class Executable(object):
         return self._get_bytecode()
 
     @property
+    def constants(self):
+        """Returns a human-readable description of all the constants in the executable.
+        Useful for debugging and diffing generated executables in unit tests."""
+        return self._get_constants()
+
+    @property
     def globals(self):
         """Get the globals used by the Relay VM executable.
 
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
index dd582de..81d704e 100644
--- a/src/relay/transforms/memory_alloc.cc
+++ b/src/relay/transforms/memory_alloc.cc
@@ -62,8 +62,9 @@ inline Constant MakeConstant(const std::vector<int64_t>& value) {
 }
 
 inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype,
-                        Array<IndexExpr> assert_shape) {
-  auto offset = MakeConstantScalar(DataType::Int(64), 0);
+                        Array<IndexExpr> assert_shape, DLDeviceType offset_device_type) {
+  auto offset =
+      OnDevice(MakeConstantScalar(DataType::Int(64), 0), offset_device_type, /*is_fixed=*/true);
   return AllocTensor(storage, offset, shape, dtype, assert_shape);
 }
 
@@ -267,8 +268,9 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
     auto sto = scope->Push(var, value);
 
     // TODO(@jroesch): There is a bug with typing based on the constant shape.
-    auto tensor = OnDevice(AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape),
-                           dev.device_type, /*is_fixed=*/true);
+    auto tensor = OnDevice(
+        AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape, cpu_device_.device_type),
+        dev.device_type, /*is_fixed=*/true);
     Var tensor_var("tensor_" + name_hint, Type(nullptr));
     return scope->Push(tensor_var, tensor);
   }
@@ -367,7 +369,8 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
       auto out_shape = out_shapes[i];
       auto out_type = out_types[i];
       auto storage = storages[i];
-      auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape),
+      auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape,
+                                        cpu_device_.device_type),
                             dev.device_type, /*is_fixed=*/true);
       Var out_var("out_" + std::to_string(i), Type(nullptr));
       outs.push_back(scope->Push(out_var, alloc));
@@ -394,7 +397,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
         CHECK(imm) << "expect static int shape";
         shape.push_back(imm->value);
       }
-      shape_expr = MakeConstant(shape);
+      shape_expr = OnDevice(MakeConstant(shape), cpu_device_.device_type, /*is_fixed=*/true);
     }
     return ReshapeTensor(new_args[0], shape_expr, ret_ty->shape);
   }
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 15d2aa0..4d7ee45 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -61,6 +61,8 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtr<Obje
   } else if (name == "get_bytecode") {
     return PackedFunc(
         [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetBytecode(); });
+  } else if (name == "get_constants") {
+    return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetConstants(); });
   } else if (name == "get_stats") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->Stats(); });
   } else if (name == "save") {
@@ -146,6 +148,34 @@ std::string Executable::GetBytecode() const {
   return oss.str();
 }
 
+namespace {
+String ShapeString(const ShapeTuple& shape_tuple, DLDataType dtype) {
+  std::stringstream sizes;
+  sizes << DLDataType2String(dtype) << "[";
+  for (size_t i = 0; i < shape_tuple.size(); i++) {
+    if (i != 0) {
+      sizes << ", ";
+    }
+    sizes << shape_tuple.data()[i];
+  }
+  sizes << "]";
+  return String(sizes.str());
+}
+}  // namespace
+
+std::string Executable::GetConstants() const {
+  std::ostringstream oss;
+
+  for (size_t i = 0; i < constants.size(); ++i) {
+    const auto& constant = constants[i];
+    auto ndarray = Downcast<NDArray>(constant);
+    DLDeviceType device_type = static_cast<DLDeviceType>(const_device_type[i]);
+    oss << "VM Constant[" << i << "]: has shape " << ShapeString(ndarray.Shape(), ndarray->dtype)
+        << " on device of type " << device_type << std::endl;
+  }
+  return oss.str();
+}
+
 std::string Executable::Stats() const {
   std::ostringstream oss;
   oss << "Relay VM executable statistics:" << std::endl;
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 52a2fef..7997974 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -999,6 +999,67 @@ def test_shape_func_nested_function():
     compiler.lower(mod, "llvm")
 
 
+@tvm.testing.requires_cuda
+def test_storage_size_and_offset_on_cpu():
+    """Tests allocations place sizes and offsets on the CPU host even if the rest
+    of the computation is on a different device type."""
+    # TODO(mbs): Better would be to test ManifestAlloc independently.
+
+    # CPU = device type 1
+    # GPU = device type 2
+    def input():
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32],
+                      param_device_types=[2], result_device_type=2) {
+              add(%a, %a)
+            }
+        """
+        )
+
+    exe = relay.vm.compile(
+        input(),
+        tvm.target.Target("cuda"),
+    )
+
+    # This program needs two constants:
+    # - The size of the tensor's storage (first arg) to alloc_storage
+    # - The offset of the tensor within the storage (second arg) to alloc_tensor
+    # Both should be on the CPU
+    assert not "on device of type 2" in exe.constants
+    assert "on device of type 1" in exe.constants
+
+
+@tvm.testing.requires_cuda
+def test_reshape_shape_on_cpu():
+    """Tests the argument to a reshape places the shape on the CPU host even if the rest
+    of the computation is on a different device type."""
+    # TODO(mbs): Better would be to test ManifestAlloc independently.
+
+    # CPU = device type 1
+    # GPU = device type 2
+    def input():
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(2, 8), float32],
+                      param_device_types=[2], result_device_type=2) {
+              reshape(%x, newshape=[2, 4, 2])
+            }
+        """
+        )
+
+    exe = relay.vm.compile(
+        input(),
+        tvm.target.Target("cuda"),
+    )
+
+    # The newshape annotation should have been turned into a constant on the CPU.
+    assert not "on device of type 2" in exe.constants
+    assert "on device of type 1" in exe.constants
+
+
 if __name__ == "__main__":
     import sys