You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by mo...@apache.org on 2021/12/11 16:55:05 UTC
[tvm] branch main updated: Add Hexagon VTCM and discontiguous allocation support (#9525)

This is an automated email from the ASF dual-hosted git repository.

moreau pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 2b35cfd  Add Hexagon VTCM and discontiguous allocation support (#9525)
2b35cfd is described below

commit 2b35cfd6ddb73afecd3f550f33881e1fdc7c3267
Author: Adam Straw <as...@octoml.ai>
AuthorDate: Sat Dec 11 08:54:40 2021 -0800

    Add Hexagon VTCM and discontiguous allocation support (#9525)
    
    * WIP Allocation abstraction for VTCM and DDR.
    
    * Add Hexagon VTCM and discontiguous allocation support
    
    * differentiate between dimensions and allocations
    
    * remove change to llvm codegen
    
    * add integration test_add_vtcm to demo vtcm alloc
    
    * remove cmake change
    
    * forcing contiguous allocation in device API, for now
    
    Co-authored-by: Chris Sullivan <cs...@octoml.ai>
---
 src/runtime/hexagon/hexagon/hexagon_buffer.cc      | 239 ++++++++++++++++-----
 src/runtime/hexagon/hexagon/hexagon_buffer.h       |  91 +++++---
 src/runtime/hexagon/hexagon/hexagon_common.cc      |   8 +-
 src/runtime/hexagon/hexagon/hexagon_common.h       |   2 +
 .../hexagon/hexagon/hexagon_device_api_v2.cc       |  83 ++++---
 .../contrib/test_hexagon/rpc/test_launcher.py      |  48 +++++
 6 files changed, 352 insertions(+), 119 deletions(-)

diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc
index 38f91c8..a35759d 100644
--- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon/hexagon_buffer.cc
@@ -23,83 +23,151 @@
 
 #include <tvm/runtime/module.h>
 
+#include "hexagon_common.h"
+
+#if defined(__hexagon__)
+#include "HAP_compute_res.h"
+#endif
+
 #include <string>
 #include <utility>
 
-#include "hexagon_common.h"
-
 namespace tvm {
 namespace runtime {
 namespace hexagon {
 
-static size_t GetDataAlignment(const DLDataType dtype) {
-  size_t align = (dtype.bits / 8) * dtype.lanes;
-  if (align < kAllocAlignment) return kAllocAlignment;
-  return align;
-}
+struct Allocation {
+  Allocation(size_t nbytes, size_t alignment) : nbytes_(nbytes), alignment_(alignment) {}
+  virtual ~Allocation() {}
+  Allocation(const Allocation&) = delete;
+  Allocation& operator=(const Allocation&) = delete;
+  Allocation(Allocation&&) = delete;
+  Allocation& operator=(Allocation&&) = delete;
 
-HexagonBuffer::HexagonBuffer(int ndim, const int64_t* shape, DLDataType dtype,
-                             Optional<String> scope) {
-  // TODO(csullivan): Re-enable check on ndim <= 2 when physical layout support
-  // in MakePackedAPI is added.
-  // ICHECK_LE(ndim, 1) << "Hexagon currently only supports flat allocations "
-  //                    << "and arrays of flat allocations.";
-
-  DLTensor t;
-  t.shape = const_cast<int64_t*>(shape);
-  t.ndim = ndim;
-  t.dtype = dtype;
-  size_t nbytes = GetDataSize(t);
-  size_t alignment = GetDataAlignment(dtype);
-  // TODO(csullivan): Extend to support arrays of allocations.
-  // Move assignment from r-value constructed flat allocation.
-  *this = HexagonBuffer(nbytes, alignment, scope);
-}
+  void* data_{nullptr};
+  size_t nbytes_;
+  size_t alignment_;
+};
 
-HexagonBuffer::HexagonBuffer(size_t nbytes, size_t alignment, Optional<String> scope) {
-  void* ptr = nullptr;
-  int ret = posix_memalign(&ptr, alignment, nbytes);
-  if (ret != 0) {
-    throw std::bad_alloc();
+struct DDRAllocation : public Allocation {
+  DDRAllocation(size_t nbytes, size_t alignment) : Allocation(nbytes, alignment) {
+#ifdef _WIN32
+    data_ = _aligned_malloc(nbytes, alignment);
+    CHECK(data_ != nullptr);
+#else
+    int ret = posix_memalign(&data_, alignment, nbytes);
+    CHECK_EQ(ret, 0);
+#endif
   }
-  allocations_.push_back(ptr);
-  SetStorageScope(scope);
+  ~DDRAllocation() {
+#ifdef _WIN32
+    _aligned_free(data_);
+#else
+    free(data_);
+#endif
+  }
+};
+
+#if defined(__hexagon__)
+struct VTCMAllocation : public Allocation {
+  VTCMAllocation(size_t nbytes, size_t alignment) : Allocation(nbytes, alignment) {
+    compute_res_attr_t res_info;
+    HEXAGON_SAFE_CALL(HAP_compute_res_attr_init(&res_info));
+
+    // allocate nbytes of vtcm on a single page
+    HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes,
+                                                          /*b_single_page = */ 1));
+    context_id_ = HAP_compute_res_acquire(&res_info, /*timeout = */ 10000);
+
+    if (context_id_) {
+      data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info);
+      if (!data_) {
+        HEXAGON_PRINT(ERROR, "ERROR: Allocated VTCM ptr is null.");
+        HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
+        return;
+      }
+    } else {
+      HEXAGON_PRINT(ERROR, "ERROR: Unable to acquire requeisted resource.");
+      return;
+    }
+    // HEXAGON_PRINT(ALWAYS, "VTCMAllocation() - Context ID: %u, VTCM ptr: %p", context_id_, data_);
+  }
+  ~VTCMAllocation() {
+    // HEXAGON_PRINT(ALWAYS, "~VTCMAllocation() - Context ID: %u, VTCM ptr: %p", context_id_,
+    // data_);
+    HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
+    data_ = nullptr;
+  }
+  unsigned int context_id_{0};
+};
+#else
+struct VTCMAllocation : public DDRAllocation {
+  VTCMAllocation(size_t nbytes, size_t alignment) : DDRAllocation(nbytes, alignment) {}
+};
+#endif
+
+template <HexagonBuffer::StorageScope S>
+std::unique_ptr<Allocation> Allocator(size_t nbytes, size_t alignment);
+
+template <>
+std::unique_ptr<Allocation> Allocator<HexagonBuffer::StorageScope::kDDR>(size_t nbytes,
+                                                                         size_t alignment) {
+  return std::make_unique<DDRAllocation>(nbytes, alignment);
 }
 
-HexagonBuffer::HexagonBuffer(void* data, Optional<String> scope) : managed_{false} {
+template <>
+std::unique_ptr<Allocation> Allocator<HexagonBuffer::StorageScope::kVTCM>(size_t nbytes,
+                                                                          size_t alignment) {
+  return std::make_unique<VTCMAllocation>(nbytes, alignment);
+}
+
+HexagonBuffer::HexagonBuffer(size_t nbytes, size_t alignment, Optional<String> scope)
+    : nallocs_(1), nbytes_(nbytes) {
   SetStorageScope(scope);
-  allocations_.push_back(data);
+
+  std::unique_ptr<Allocation> alloca = nullptr;
+  if (GetStorageScope() == StorageScope::kDDR) {
+    alloca = Allocator<StorageScope::kDDR>(nbytes, alignment);
+  } else if (GetStorageScope() == StorageScope::kVTCM) {
+    alloca = Allocator<StorageScope::kVTCM>(nbytes, alignment);
+  }
+  CHECK(alloca != nullptr);
+  allocations_.push_back(alloca->data_);
+  managed_allocations_.push_back(std::move(alloca));
 }
 
-HexagonBuffer::~HexagonBuffer() {
-  if (managed_) {
-    for (auto& ptr : allocations_) {
-      free(ptr);
+HexagonBuffer::HexagonBuffer(size_t nallocs, size_t nbytes, size_t alignment,
+                             Optional<String> scope)
+    : nallocs_(nallocs), nbytes_(nallocs * nbytes) {
+  SetStorageScope(scope);
+  for (size_t i = 0; i < nallocs; ++i) {
+    std::unique_ptr<Allocation> alloca = nullptr;
+    if (GetStorageScope() == StorageScope::kDDR) {
+      alloca = Allocator<StorageScope::kDDR>(nbytes, alignment);
+    } else if (GetStorageScope() == StorageScope::kVTCM) {
+      alloca = Allocator<StorageScope::kVTCM>(nbytes, alignment);
     }
+    CHECK(alloca != nullptr);
+    allocations_.push_back(alloca->data_);
+    managed_allocations_.push_back(std::move(alloca));
   }
 }
 
-HexagonBuffer::HexagonBuffer(HexagonBuffer&& other)
-    : allocations_(other.allocations_),
-      managed_(other.managed_),
-      storage_scope_(other.storage_scope_) {
-  other.allocations_.clear();
-  other.managed_ = false;
-  other.storage_scope_ = StorageScope::kDDR;
+HexagonBuffer::HexagonBuffer(void* data, size_t nbytes, Optional<String> scope)
+    : nallocs_(1), nbytes_(nbytes) {
+  SetStorageScope(scope);
+  // disallow external VTCM allocations
+  CHECK(GetStorageScope() != HexagonBuffer::StorageScope::kVTCM);
+  allocations_.push_back(data);
 }
 
-HexagonBuffer& HexagonBuffer::operator=(HexagonBuffer&& other) {
-  std::swap(allocations_, other.allocations_);
-  std::swap(managed_, other.managed_);
-  std::swap(storage_scope_, other.storage_scope_);
-  return *this;
-}
+HexagonBuffer::~HexagonBuffer() { managed_allocations_.clear(); }
 
-void* HexagonBuffer::GetPointer() {
+void** HexagonBuffer::GetPointer() {
   if (!allocations_.size()) {
     return nullptr;
   }
-  return (allocations_.size() > 1) ? allocations_.data() : allocations_[0];
+  return allocations_.data();
 }
 
 HexagonBuffer::StorageScope HexagonBuffer::GetStorageScope() const { return storage_scope_; }
@@ -119,11 +187,70 @@ void HexagonBuffer::SetStorageScope(Optional<String> scope) {
   }
 }
 
-HexagonBuffer* IsHexagonBuffer(DLTensor* tensor) {
-  if (TVMDeviceExtType(tensor->device.device_type) == kDLHexagon) {
-    return static_cast<HexagonBuffer*>(tensor->data);
+void HexagonBuffer::CopyTo(void* data, size_t nbytes) {
+  CHECK(nbytes_ == nbytes);
+  size_t offset = 0;
+  for (size_t i = 0; i < nallocs_; ++i) {
+    CHECK(nbytes / nallocs_ == managed_allocations_[i]->nbytes_);
+
+    memcpy(static_cast<char*>(data) + offset,
+           static_cast<const char*>(managed_allocations_[i]->data_),
+           managed_allocations_[i]->nbytes_);
+
+    offset += managed_allocations_[i]->nbytes_;
+  }
+}
+
+void HexagonBuffer::CopyFrom(void* data, size_t nbytes) {
+  CHECK(nbytes_ == nbytes);
+  size_t offset = 0;
+  for (size_t i = 0; i < nallocs_; ++i) {
+    CHECK(nbytes / nallocs_ == managed_allocations_[i]->nbytes_);
+
+    memcpy(static_cast<char*>(managed_allocations_[i]->data_),
+           static_cast<const char*>(data) + offset, managed_allocations_[i]->nbytes_);
+
+    offset += managed_allocations_[i]->nbytes_;
+  }
+}
+
+void HexagonBuffer::CopyFrom(const HexagonBuffer& other) {
+  CHECK(nbytes_ == other.nbytes_);
+
+  if (nallocs_ == other.nallocs_) {
+    for (size_t i = 0; i < nallocs_; ++i) {
+      CHECK(managed_allocations_[i]->nbytes_ == other.managed_allocations_[i]->nbytes_);
+
+      memcpy(static_cast<char*>(managed_allocations_[i]->data_),
+             static_cast<const char*>(other.managed_allocations_[i]->data_),
+             managed_allocations_[i]->nbytes_);
+    }
+  } else if (nallocs_ == 1) {
+    size_t offset = 0;
+    for (size_t i = 0; i < other.nallocs_; ++i) {
+      CHECK(nbytes_ / other.nallocs_ == other.managed_allocations_[i]->nbytes_);
+
+      memcpy(static_cast<char*>(managed_allocations_[0]->data_) + offset,
+             static_cast<const char*>(other.managed_allocations_[i]->data_),
+             other.managed_allocations_[i]->nbytes_);
+
+      offset += other.managed_allocations_[i]->nbytes_;
+    }
+  } else if (other.nallocs_ == 1) {
+    size_t offset = 0;
+    for (size_t i = 0; i < nallocs_; ++i) {
+      CHECK(other.nbytes_ / nallocs_ == managed_allocations_[i]->nbytes_);
+
+      memcpy(static_cast<char*>(managed_allocations_[i]->data_),
+             static_cast<const char*>(other.managed_allocations_[0]->data_) + offset,
+             managed_allocations_[i]->nbytes_);
+
+      offset += managed_allocations_[i]->nbytes_;
+    }
+  } else {
+    CHECK(false) << "To copy between Hexagon Buffers they must either have the same number of "
+                    "dimensions or one of the Hexagon Buffers must have a single dimension.";
   }
-  return nullptr;
 }
 
 }  // namespace hexagon
diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.h b/src/runtime/hexagon/hexagon/hexagon_buffer.h
index c62cee6..139c38e 100644
--- a/src/runtime/hexagon/hexagon/hexagon_buffer.h
+++ b/src/runtime/hexagon/hexagon/hexagon_buffer.h
@@ -26,36 +26,38 @@
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 
+#include <memory>
 #include <vector>
 
 namespace tvm {
 namespace runtime {
 namespace hexagon {
 
+struct Allocation;
+
 class HexagonBuffer {
  public:
-  /* \brief Allocate memory within hexagon accessible memory
-   * scopes.
+  /* \brief Allocate 1d (contiguous) memory within Hexagon accessible
+   * memory scopes.
    *
-   * \param ndim The number of dimensions of physical storage
+   * \param nbytes The number of bytes of physical storage
    * to allocate.
    *
-   * \param shape The shape of the ndarray for which to allocate
-   * physical storage.
-   *
-   * \param dtype The data type of the physical storage.
+   * \param alignment The byte alignment to be used when allocating.
    *
    * \param scope Optional storage scope indicating the memory
    * space in which to allocate. Defaults to global system
    * memory (DDR).
    */
-  HexagonBuffer(int ndim, const int64_t* shape, DLDataType dtype, Optional<String> scope);
+  HexagonBuffer(size_t nbytes, size_t alignment, Optional<String> scope);
 
-  /* \brief Allocate memory within hexagon accessible memory
-   * scopes.
+  /* \brief Allocate 2d (discontiguous) memory within Hexagon accessible
+   * memory scopes.
    *
-   * \param nbytes The number of bytes of flat physical storage
-   * to allocate.
+   * \param nallocs The number of allocations.
+   *
+   * \param nbytes The number of bytes of physical storage
+   * to allocate per allocation.
    *
    * \param alignment The byte alignment to be used when allocating.
    *
@@ -63,17 +65,19 @@ class HexagonBuffer {
    * space in which to allocate. Defaults to global system
    * memory (DDR).
    */
-  HexagonBuffer(size_t nbytes, size_t alignment, Optional<String> scope);
+  HexagonBuffer(size_t nallocs, size_t nbytes, size_t alignment, Optional<String> scope);
 
-  /* \brief Construct a hexagon buffer from externally allocated storage.
+  /* \brief Construct a Hexagon Buffer from an external buffer.
+   *
+   * \param data The pointer to the external buffer.
    *
-   * \param data The externally allocated storage.
+   * \param nbytes The size of the external buffer in bytes.
    *
    * \param scope Optional storage scope indicating the memory
-   * space in the external allocation belongs. Assumes global system
-   * memory if not provided.
+   * space in which to allocate. Defaults to global system
+   * memory (DDR).
    */
-  explicit HexagonBuffer(void* data, Optional<String> scope = Optional<String>());
+  explicit HexagonBuffer(void* data, size_t nbytes, Optional<String> scope);
 
   //! \brief Destruction deallocates the underlying allocations.
   ~HexagonBuffer();
@@ -84,14 +88,14 @@ class HexagonBuffer {
   //! \brief Prevent copy assignment with HexagonBuffers.
   HexagonBuffer& operator=(const HexagonBuffer&) = delete;
 
-  //! \brief Allow move construction.
-  HexagonBuffer(HexagonBuffer&&);
+  //! \brief Prevent move construction.
+  HexagonBuffer(HexagonBuffer&&) = delete;
 
-  //! \brief Allow move assignment.
-  HexagonBuffer& operator=(HexagonBuffer&&);
+  //! \brief Prevent move assignment.
+  HexagonBuffer& operator=(HexagonBuffer&&) = delete;
 
-  //! \brief Return pointer to allocation or allocations.
-  void* GetPointer();
+  //! \brief Return pointer to allocations.
+  void** GetPointer();
 
   //! \brief Memory scopes managed by a Hexagon Buffer.
   enum class StorageScope {
@@ -106,28 +110,49 @@ class HexagonBuffer {
   //! \brief Return storage scope of underlying allocation.
   StorageScope GetStorageScope() const;
 
+  /* \brief Copy data from a Hexagon Buffer an external buffer.
+   *
+   * \param data The pointer to the external buffer.
+   *
+   * \param nbytes The number of bytes to copy.
+   */
+  void CopyTo(void* data, size_t nbytes);
+
+  /* \brief Copy data from an external buffer to a Hexagon Buffer.
+   *
+   * \param data The pointer to the external buffer.
+   *
+   * \param nbytes The number of bytes to copy.
+   */
+  void CopyFrom(void* data, size_t nbytes);
+
+  /* \brief Copy data from one Hexagon Buffer to another.
+   *
+   * \param other The other Hexagon Buffer.
+   */
+  void CopyFrom(const HexagonBuffer& other);
+
  private:
   //! \brief Assign a storage scope to the buffer.
   void SetStorageScope(Optional<String> scope);
-  /*! \brief Array of allocations required by the buffer.
+  /*! \brief Array of raw pointer allocations required by the buffer.
    *
-   *  For a 1d (flat) storage, a single contiguous allocation will
-   *  result. For 2d storage, (count, nbytes) = shape, which will
-   *  result in `count` discrete allocations.
+   *  For 1d (contiguous) storage a single allocation will result.
+   *  For 2d (discontiguous) storage `nallocs` allocations will result.
    */
   std::vector<void*> allocations_;
-  /*! \brief Whether the allocation(s) present are managed
-   *  and should be deallocated upon destruction.
+  /*! \brief Managed allocations which follow RAII and are released
+   *  during destruction.
    */
-  bool managed_{true};
+  std::vector<std::unique_ptr<Allocation>> managed_allocations_;
   /*! \brief The underlying storage type in which the allocation
    *  resides.
    */
+  size_t nallocs_;
+  size_t nbytes_;
   StorageScope storage_scope_;
 };
 
-HexagonBuffer* IsHexagonBuffer(DLTensor* tensor);
-
 }  // namespace hexagon
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/hexagon/hexagon/hexagon_common.cc b/src/runtime/hexagon/hexagon/hexagon_common.cc
index bed6f45..6927cd5 100644
--- a/src/runtime/hexagon/hexagon/hexagon_common.cc
+++ b/src/runtime/hexagon/hexagon/hexagon_common.cc
@@ -59,7 +59,9 @@ void HexagonLookupLinkedParam(TVMArgs args, TVMRetValue* rv) {
   std::vector<int64_t> shape_vec{template_tensor->shape,
                                  template_tensor->shape + template_tensor->ndim};
 
-  auto* param_buffer = new HexagonBuffer(static_cast<void*>(opaque_handle));
+  Optional<String> scope("global");
+  auto* param_buffer =
+      new HexagonBuffer(static_cast<void*>(opaque_handle), GetDataSize(*template_tensor), scope);
   auto* container = new NDArray::Container(static_cast<void*>(param_buffer), shape_vec,
                                            template_tensor->dtype, dev);
   container->SetDeleter([](Object* container) {
@@ -85,7 +87,9 @@ PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr<Object>&
       if (args.type_codes[i] == kTVMDLTensorHandle) {
         DLTensor* tensor = static_cast<DLTensor*>(arg_values[i].v_handle);
         buffer_args.emplace_back(i, static_cast<HexagonBuffer*>(tensor->data));
-        tensor->data = buffer_args.back().second->GetPointer();
+        // Assumes a single contiguous allocation
+        // TODO(Straw): Enable discontiguous allocation after RFC 39 lands
+        tensor->data = buffer_args.back().second->GetPointer()[0];
       }
     }
     int ret = (*faddr)(const_cast<TVMValue*>(args.values), const_cast<int*>(args.type_codes),
diff --git a/src/runtime/hexagon/hexagon/hexagon_common.h b/src/runtime/hexagon/hexagon/hexagon_common.h
index 87d36c9..e1eca72 100644
--- a/src/runtime/hexagon/hexagon/hexagon_common.h
+++ b/src/runtime/hexagon/hexagon/hexagon_common.h
@@ -62,4 +62,6 @@ inline bool IsHexagonDevice(DLDevice dev) {
   return TVMDeviceExtType(dev.device_type) == kDLHexagon;
 }
 
+constexpr int kHexagonAllocAlignment = 2048;
+
 #endif  // TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_COMMON_H_
diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc
index 5b79f80..4461ac5 100644
--- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc
+++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc
@@ -51,63 +51,90 @@ void HexagonDeviceAPIv2::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* r
   }
 }
 
+// DataSpace: static allocations for Hexagon
 void* HexagonDeviceAPIv2::AllocDataSpace(Device dev, int ndim, const int64_t* shape,
                                          DLDataType dtype, Optional<String> mem_scope) {
-  return new HexagonBuffer(ndim, shape, dtype, mem_scope.defined() ? mem_scope : String("global"));
+  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon);
+
+  // Forcing contiguous allocation, for now
+  // TODO(Straw): Enable discontiguous allocation after RFC 39 lands
+  size_t nallocs = 1;
+  size_t nbytes = 1;
+  for (int i = 0; i < ndim; ++i) {
+    nbytes *= shape[i];
+  }
+  size_t typesize = (dtype.bits / 8) * dtype.lanes;
+  nbytes *= typesize;
+
+  size_t alignment = typesize;
+  if (alignment < kHexagonAllocAlignment) {
+    alignment = kHexagonAllocAlignment;
+  }
+  return new HexagonBuffer(nallocs, nbytes, alignment, mem_scope);
 }
 
 void* HexagonDeviceAPIv2::AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
                                          DLDataType type_hint) {
+  if (alignment < kHexagonAllocAlignment) {
+    alignment = kHexagonAllocAlignment;
+  }
   return new HexagonBuffer(nbytes, alignment, String("global"));
 }
 
 void HexagonDeviceAPIv2::FreeDataSpace(Device dev, void* ptr) {
-  auto* pbuf = static_cast<HexagonBuffer*>(ptr);
-  delete pbuf;
+  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon);
+  auto* hexbuf = static_cast<HexagonBuffer*>(ptr);
+  CHECK(hexbuf != nullptr);
+  delete hexbuf;
 }
 
+// WorkSpace: runtime allocations for Hexagon
 struct HexagonWorkspacePool : public WorkspacePool {
   HexagonWorkspacePool() : WorkspacePool(kDLCPU, HexagonDeviceAPIv2::Global()) {}
 };
 
 void* HexagonDeviceAPIv2::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
-  auto* buffer = static_cast<HexagonBuffer*>(
+  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon);
+  auto* hexbuf = static_cast<HexagonBuffer*>(
       dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->AllocWorkspace(dev, size));
-  void* ptr = buffer->GetPointer();
-  workspace_allocations_.insert({ptr, buffer});
+
+  // Assumes a single contiguous allocation
+  // TODO(Straw): Enable discontiguous allocation after RFC 39 lands
+  void* ptr = hexbuf->GetPointer()[0];
+  workspace_allocations_.insert({ptr, hexbuf});
   return ptr;
 }
 
 void HexagonDeviceAPIv2::FreeWorkspace(Device dev, void* data) {
+  CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon);
   auto it = workspace_allocations_.find(data);
-  ICHECK(it != workspace_allocations_.end())
+  CHECK(it != workspace_allocations_.end())
       << "Attempt made to free unknown or already freed workspace allocation";
   dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, it->second);
   workspace_allocations_.erase(it);
 }
 
 void HexagonDeviceAPIv2::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
-  if (IsHexagonDevice(from->device) && IsHexagonDevice(to->device)) {
-    HexagonBuffer* buffer_src = static_cast<HexagonBuffer*>(from->data);
-    HexagonBuffer* buffer_dst = static_cast<HexagonBuffer*>(to->data);
-    // Check storage scopes
-    if (buffer_src->GetStorageScope() == HexagonBuffer::StorageScope::kDDR &&
-        buffer_dst->GetStorageScope() == HexagonBuffer::StorageScope::kDDR) {
-      memcpy(static_cast<char*>(buffer_dst->GetPointer()) + to->byte_offset,
-             static_cast<const char*>(buffer_src->GetPointer()) + from->byte_offset,
-             GetDataSize(*from));
-    } else {
-      ICHECK(false) << "Currently only copying between DDR storage is supported.";
-    }
-  } else if (IsHexagonDevice(from->device) && to->device.device_type == kDLCPU) {
-    HexagonBuffer* buffer_src = static_cast<HexagonBuffer*>(from->data);
-    memcpy(static_cast<char*>(to->data) + to->byte_offset,
-           static_cast<const char*>(buffer_src->GetPointer()) + from->byte_offset,
-           GetDataSize(*from));
-  } else if (from->device.device_type == kDLCPU && IsHexagonDevice(to->device)) {
-    HexagonBuffer* buffer_dst = static_cast<HexagonBuffer*>(to->data);
-    memcpy(static_cast<char*>(buffer_dst->GetPointer()) + to->byte_offset,
-           static_cast<const char*>(from->data) + from->byte_offset, GetDataSize(*from));
+  CHECK_EQ(from->byte_offset, 0);
+  CHECK_EQ(to->byte_offset, 0);
+  CHECK_EQ(GetDataSize(*from), GetDataSize(*to));
+
+  HexagonBuffer* hex_from_buf = static_cast<HexagonBuffer*>(from->data);
+  HexagonBuffer* hex_to_buf = static_cast<HexagonBuffer*>(to->data);
+
+  if (TVMDeviceExtType(from->device.device_type) == kDLHexagon &&
+      TVMDeviceExtType(to->device.device_type) == kDLHexagon) {
+    CHECK(hex_from_buf != nullptr);
+    CHECK(hex_to_buf != nullptr);
+    hex_to_buf->CopyFrom(*hex_from_buf);
+  } else if (from->device.device_type == kDLCPU &&
+             TVMDeviceExtType(to->device.device_type) == kDLHexagon) {
+    CHECK(hex_to_buf != nullptr);
+    hex_to_buf->CopyFrom(from->data, GetDataSize(*from));
+  } else if (TVMDeviceExtType(from->device.device_type) == kDLHexagon &&
+             to->device.device_type == kDLCPU) {
+    CHECK(hex_from_buf != nullptr);
+    hex_from_buf->CopyTo(to->data, GetDataSize(*to));
   } else {
     CHECK(false)
         << "Expect copy between DLTensor devices of types kDLHexagon and kDLCPU (external) only.";
diff --git a/tests/python/contrib/test_hexagon/rpc/test_launcher.py b/tests/python/contrib/test_hexagon/rpc/test_launcher.py
index d705541..15dfac6 100644
--- a/tests/python/contrib/test_hexagon/rpc/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/rpc/test_launcher.py
@@ -76,6 +76,54 @@ def test_add(tvm_tracker_host, tvm_tracker_port, android_serial_number):
     launcher.close()
 
 
+@requires_rpc_tracker
+@requires_hexagon_toolchain
+def test_add_vtcm(tvm_tracker_host, tvm_tracker_port, android_serial_number):
+    dtype = "int8"
+    A = tvm.te.placeholder((2,), dtype=dtype)
+    B = tvm.te.placeholder((1,), dtype=dtype)
+    C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
+    sched = tvm.te.create_schedule(C.op)
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    func = tvm.build(
+        sched, [A, B, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="add"
+    )
+
+    temp = utils.tempdir()
+    dso_binary = "test_binary.so"
+    dso_binary_path = temp.relpath(dso_binary)
+    func.save(dso_binary_path)
+
+    launcher = HexagonLauncher(serial_number=android_serial_number)
+    launcher.android_run_rpc(rpc_tracker_host=tvm_tracker_host, rpc_tracker_port=tvm_tracker_port)
+    launcher.hexagon_setup()
+    remote_kw = {
+        "host": tvm_tracker_host,
+        "port": tvm_tracker_port,
+        "priority": 0,
+        "timeout": 60,
+    }
+    launcher.hexagon_session_setup(remote_kw)
+    launcher.upload(dso_binary_path, dso_binary)
+
+    with launcher.session as sess:
+        mod = launcher.get_module(dso_binary)
+        A_data = tvm.nd.empty(A.shape, A.dtype, sess.device, "global.vtcm")
+        A_data.copyfrom(np.array([2, 3]))
+
+        B_data = tvm.nd.empty(B.shape, B.dtype, sess.device, "global.vtcm")
+        B_data.copyfrom(np.array([4]))
+
+        C_data = tvm.nd.empty(C.shape, C.dtype, sess.device, "global.vtcm")
+        C_data.copyfrom(np.array([0, 0]))
+
+        mod["add"](A_data, B_data, C_data)
+        result = C_data.numpy()
+        assert (result == np.array([6, 7])).all()
+    launcher.close()
+
+
 class TestMatMul:
     M = tvm.testing.parameter(32)
     N = tvm.testing.parameter(32)