You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2021/02/28 15:52:33 UTC

[tvm] branch main updated: [Runtime] Special Memory Scope Support (#7488)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 2673309  [Runtime] Special Memory Scope Support (#7488)
2673309 is described below

commit 26733095f5a1e0887c32d644429d430bc1f51c91
Author: ziheng <zi...@apache.org>
AuthorDate: Sun Feb 28 23:52:16 2021 +0800

    [Runtime] Special Memory Scope Support (#7488)
---
 include/tvm/runtime/c_runtime_api.h       |  35 ++++--
 include/tvm/runtime/device_api.h          |  42 +++++--
 include/tvm/runtime/ndarray.h             |   7 +-
 python/tvm/runtime/ndarray.py             |  38 +++---
 src/runtime/c_runtime_api.cc              |  64 +++++++++-
 src/runtime/cpu_device_api.cc             |  13 +-
 src/runtime/crt/common/crt_runtime_api.c  |  39 +++++-
 src/runtime/cuda/cuda_device_api.cc       |   2 +
 src/runtime/hexagon/hexagon_device_api.cc |   8 +-
 src/runtime/metal/metal_common.h          |   8 +-
 src/runtime/minrpc/minrpc_server.h        | 137 ++++++++++++--------
 src/runtime/minrpc/rpc_reference.h        |  76 ++++++-----
 src/runtime/ndarray.cc                    |  65 ++++++----
 src/runtime/opencl/opencl_common.h        |   8 +-
 src/runtime/rpc/rpc_device_api.cc         |  63 +++++++---
 src/runtime/rpc/rpc_endpoint.cc           | 201 +++++++++++++++---------------
 src/runtime/rpc/rpc_endpoint.h            |   6 +-
 src/runtime/rpc/rpc_local_session.cc      |  40 +++---
 src/runtime/rpc/rpc_local_session.h       |   6 +-
 src/runtime/rpc/rpc_session.cc            |  15 +--
 src/runtime/rpc/rpc_session.h             |  43 ++-----
 src/runtime/vulkan/vulkan.cc              |   2 +
 web/emcc/tvmjs_support.cc                 |  44 ++++---
 web/emcc/webgpu_runtime.cc                |   2 +
 24 files changed, 577 insertions(+), 387 deletions(-)

diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 467e69a..59316a0 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -560,6 +560,23 @@ TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignme
                                     DLDataType type_hint, void** out_data);
 
 /*!
+ * \brief Allocate a data space on device with special memory scope.
+ * \note The memory could use a special multi-dimensional memory layout.
+ *       That is why we pass shape and dtype instead of raw number of bytes.
+ * \param ctx The device context to perform operation.
+ * \param ndim The number of dimension of the tensor.
+ * \param shape The shape of the tensor.
+ * \param dtype The type of elements.
+ * \param mem_scope The memory scope of the tensor,
+ *        can be nullptr, which indicate the default global DRAM
+ * \param out_data The allocated device pointer.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+                                             DLDataType dtype, const char* mem_scope,
+                                             void** out_data);
+
+/*!
  * \brief Free a data space on device.
  * \param ctx The device context to perform operation.
  * \param ptr The data space.
@@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr);
 
 /*!
  * \brief Copy data from one place to another.
- * \param from The source array.
- * \param from_offset The byte offeset in the from.
- * \param to The target array.
- * \param to_offset The byte offset in the to.
- * \param num_bytes The size of the memory in bytes
- * \param ctx_from The source context
- * \param ctx_to The target context
- * \param type_hint The type of elements, only neded by certain backends.
- *                  can be useful for cross device endian converison.
+ * \note This API is designed to support special memory with shape dependent layout.
+ *       We pass in DLTensor* with shape information to support these cases.
+ * \param from The source tensor.
+ * \param to The target tensor.
  * \param stream Optional stream object.
  * \return 0 when success, -1 when failure happens.
  */
-TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to,
-                                    size_t to_offset, size_t num_bytes, TVMContext ctx_from,
-                                    TVMContext ctx_to, DLDataType type_hint,
-                                    TVMStreamHandle stream);
+TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
 
 /*!
  * \brief Check that an object is derived from another.
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index a6f5624..1276663 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -91,6 +91,17 @@ class TVM_DLL DeviceAPI {
   virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
                                DLDataType type_hint) = 0;
   /*!
+   * \brief Allocate a data space on device with memory scope support.
+   * \param ctx The device context to perform operation.
+   * \param ndim The number of dimension of allocated tensor.
+   * \param shape The shape of allocated tensor.
+   * \param dtype The type of elements.
+   * \param mem_scope The memory scope of allocated tensor.
+   * \return The allocated device pointer.
+   */
+  virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                               Optional<String> mem_scope = NullOpt);
+  /*!
    * \brief Free a data space on device.
    * \param ctx The device context to perform operation.
    * \param ptr The data space.
@@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI {
   virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
   /*!
    * \brief copy data from one place to another
+   * \note This API is designed to support special memory with shape dependent layout.
+   *       We pass in DLTensor* with shape information to support these cases.
    * \param from The source array.
-   * \param from_offset The byte offeset in the from.
    * \param to The target array.
-   * \param to_offset The byte offset in the to.
-   * \param num_bytes The size of the memory in bytes
-   * \param ctx_from The source context
-   * \param ctx_to The target context
-   * \param type_hint The type of elements, only neded by certain backends.
-   *                  can be useful for cross device endian converison.
    * \param stream Optional stream object.
    */
-  virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                              size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                              DLDataType type_hint, TVMStreamHandle stream) = 0;
+  virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
   /*!
    * \brief Create a new stream of execution.
    *
@@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI {
   static bool NeedSetDeviceContext(int device_type) {
     return device_type != kDLCPU && device_type != kDLMicroDev;
   }
+
+ protected:
+  /*!
+   * \brief copy data from one place to another
+   * \param from The source array.
+   * \param from_offset The byte offeset in the from.
+   * \param to The target array.
+   * \param to_offset The byte offset in the to.
+   * \param num_bytes The size of the memory in bytes
+   * \param ctx_from The source context
+   * \param ctx_to The target context
+   * \param type_hint The type of elements, only neded by certain backends.
+   *                  can be useful for cross device endian converison.
+   * \param stream Optional stream object.
+   */
+  virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                              size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                              DLDataType type_hint, TVMStreamHandle stream);
 };
 
 /*! \brief The device type bigger than this is RPC device */
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 0ff171d..a884b5c 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -25,6 +25,7 @@
 #define TVM_RUNTIME_NDARRAY_H_
 
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/container.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/serializer.h>
@@ -133,10 +134,12 @@ class NDArray : public ObjectRef {
    * \brief Create an empty NDArray.
    * \param shape The shape of the new array.
    * \param dtype The data type of the new array.
-   * \param ctx The context of the Array.
+   * \param ctx The context of the array.
+   * \param mem_scope The memory scope of the array.
    * \return The created Array
    */
-  TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
+  TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
+                               Optional<String> mem_scope = NullOpt);
   /*!
    * \brief Create a NDArray backed by a dlpack tensor.
    *
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 2f616ce..75da3d4 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -23,6 +23,7 @@ import tvm._ffi
 from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE
 from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle
 from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t
+from . import _ffi_api
 
 try:
     # pylint: disable=wrong-import-position
@@ -253,42 +254,41 @@ def numpyasarray(np_data):
     return arr, shape
 
 
-def empty(shape, dtype="float32", ctx=context(1, 0)):
+def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None):
     """Create an empty array given shape and device
 
     Parameters
     ----------
     shape : tuple of int
-        The shape of the array
+        The shape of the array.
 
     dtype : type or str
         The data type of the array.
 
     ctx : TVMContext
-        The context of the array
+        The context of the array.
+
+    mem_scope : Optional[str]
+        The memory scope of the array.
 
     Returns
     -------
     arr : tvm.nd.NDArray
         The array tvm supported.
     """
-    shape = c_array(tvm_shape_index_t, shape)
-    ndim = ctypes.c_int(len(shape))
-    handle = TVMArrayHandle()
+    shape_imm = []
+    for s in shape:
+        if isinstance(s, tvm.tir.IntImm):
+            shape_imm.append(s.value)
+        else:
+            shape_imm.append(int(s))
+    arr = np.array(shape_imm, "int64")
+    ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+    shape_ptr = ctypes.cast(ptr, ctypes.c_void_p)
+    ndim = len(shape_imm)
     dtype = DataType(dtype)
-    check_call(
-        _LIB.TVMArrayAlloc(
-            shape,
-            ndim,
-            ctypes.c_int(dtype.type_code),
-            ctypes.c_int(dtype.bits),
-            ctypes.c_int(dtype.lanes),
-            ctx.device_type,
-            ctx.device_id,
-            ctypes.byref(handle),
-        )
-    )
-    return _make_array(handle, False, False)
+    arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope)
+    return arr
 
 
 def from_dlpack(dltensor):
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index b4457bf..7fd27cb 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin
   return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
 }
 
+static size_t GetDataAlignment(const DLDataType dtype) {
+  size_t align = (dtype.bits / 8) * dtype.lanes;
+  if (align < kAllocAlignment) return kAllocAlignment;
+  return align;
+}
+
+void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                                Optional<String> mem_scope) {
+  if (!mem_scope.defined() || mem_scope.value() == "global") {
+    // by default, we can always redirect to the flat memory allocations
+    DLTensor temp;
+    temp.data = nullptr;
+    temp.ctx = ctx;
+    temp.ndim = ndim;
+    temp.dtype = dtype;
+    temp.shape = const_cast<int64_t*>(shape);
+    temp.strides = nullptr;
+    temp.byte_offset = 0;
+    size_t size = GetDataSize(temp);
+    size_t alignment = GetDataAlignment(temp.dtype);
+    return AllocDataSpace(ctx, size, alignment, dtype);
+  }
+  LOG(FATAL) << "Device does not support allocate data space with "
+             << "specified memory scope: " << mem_scope.value();
+  return nullptr;
+}
+
+void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
+  // by default, we can always redirect to the flat memory copy operation.
+  size_t nbytes = GetDataSize(*from);
+  ICHECK_EQ(nbytes, GetDataSize(*to));
+
+  ICHECK(IsContiguous(*from) && IsContiguous(*to))
+      << "CopyDataFromTo only support contiguous array for now";
+  CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx,
+                 to->ctx, from->dtype, stream);
+}
+
+void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                               size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                               DLDataType type_hint, TVMStreamHandle stream) {
+  LOG(FATAL) << "Device does not support CopyDataFromTo.";
+}
+
 void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); }
 
 TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) {
@@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
   API_END();
 }
 
+int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+                                     DLDataType dtype, const char* mem_scope, void** out_data) {
+  API_BEGIN();
+  Optional<String> scope;
+  if (mem_scope != nullptr) {
+    scope = String(std::string(mem_scope));
+  }
+  out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope);
+  API_END();
+}
+
 int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) {
   API_BEGIN();
   DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr);
   API_END();
 }
 
-int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                            size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                            DLDataType type_hint, TVMStreamHandle stream) {
+int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
   API_BEGIN();
+  TVMContext ctx_from = from->ctx;
+  TVMContext ctx_to = to->ctx;
   TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to;
-  DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, num_bytes, ctx_from,
-                                             ctx_to, type_hint, stream);
+  DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream);
   API_END();
 }
 
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index 146bfa8..b745be3 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI {
 #endif
   }
 
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final {
-    memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
-  }
-
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}
 
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
@@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI {
     static auto* inst = new CPUDeviceAPI();
     return inst;
   }
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
+                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final {
+    memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
+  }
 };
 
 struct CPUWorkspacePool : public WorkspacePool {
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index bc47f99..c2eb1ff 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -22,6 +22,7 @@
 #include <assert.h>
 #include <inttypes.h>
 #include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -87,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
   if (alignment != 1) {
     nbytes = (nbytes + alignment - 1) / alignment * alignment;
   }
-
   return TVMPlatformMemoryAllocate(nbytes, ctx, out_data);
 }
 
+int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+                                     DLDataType dtype, const char* mem_scope, void** out_data) {
+  size_t nbytes = 1;
+  for (int i = 0; i < ndim; ++i) {
+    nbytes *= shape[i];
+  }
+  nbytes *= (dtype.bits * dtype.lanes + 7) / 8;
+
+  int kAllocAlignment = 128;
+  size_t align = (dtype.bits / 8) * dtype.lanes;
+  if (align < kAllocAlignment) align = kAllocAlignment;
+  return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data);
+}
+
 int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); }
 
-int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                            size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                            DLDataType type_hint, TVMStreamHandle stream) {
-  memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes);
+static bool IsContiguous(const DLTensor* arr) {
+  if (arr->strides == NULL) return true;
+  int64_t expected_stride = 1;
+  for (int32_t i = arr->ndim; i != 0; --i) {
+    int32_t k = i - 1;
+    if (arr->strides[k] != expected_stride) return false;
+    expected_stride *= arr->shape[k];
+  }
+  return true;
+}
+
+int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
+  assert(IsContiguous(from) && IsContiguous(to));
+  size_t size = 1;
+  for (int i = 0; i < from->ndim; ++i) {
+    size *= from->shape[i];
+  }
+  size *= (from->dtype.bits * from->dtype.lanes + 7) / 8;
+  memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size);
   return 0;
 }
 
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 30abfc8..c773954 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -127,6 +127,7 @@ class CUDADeviceAPI final : public DeviceAPI {
     }
   }
 
+ protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                       TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
@@ -166,6 +167,7 @@ class CUDADeviceAPI final : public DeviceAPI {
     }
   }
 
+ public:
   TVMStreamHandle CreateStream(TVMContext ctx) {
     CUDA_CALL(cudaSetDevice(ctx.device_id));
     cudaStream_t retval;
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 605c55e..70cebf5 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -35,9 +35,6 @@ class HexagonDeviceAPI : public DeviceAPI {
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
-                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
-                      DLDataType type_hint, TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final;
   void FreeWorkspace(TVMContext ctx, void* ptr) final;
@@ -48,6 +45,11 @@ class HexagonDeviceAPI : public DeviceAPI {
     static HexagonDeviceAPI* inst = new HexagonDeviceAPI();
     return inst;
   }
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                      DLDataType type_hint, TVMStreamHandle stream) final;
 };
 
 // HexagonDeviceAPI.
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index d13ac7e..bd07dbf 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -84,14 +84,16 @@ class MetalWorkspace final : public DeviceAPI {
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
-  void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
   void FreeWorkspace(TVMContext ctx, void* data) final;
   // get the global workspace
   static MetalWorkspace* Global();
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size,
+                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final;
 };
 
 /*! \brief Thread local workspace */
diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h
index d28e0c3..d5c61ec 100644
--- a/src/runtime/minrpc/minrpc_server.h
+++ b/src/runtime/minrpc/minrpc_server.h
@@ -169,28 +169,39 @@ class MinRPCServer {
   }
 
   void HandleCopyFromRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-
-    this->Read(&handle);
-    this->Read(&offset);
+    DLTensor* arr = this->ArenaAlloc<DLTensor>(1);
+    uint64_t data_handle;
+    this->Read(&data_handle);
+    arr->data = reinterpret_cast<void*>(data_handle);
+    this->Read(&(arr->ctx));
+    this->Read(&(arr->ndim));
+    this->Read(&(arr->dtype));
+    arr->shape = this->ArenaAlloc<int64_t>(arr->ndim);
+    this->ReadArray(arr->shape, arr->ndim);
+    arr->strides = nullptr;
+    this->Read(&(arr->byte_offset));
+
+    uint64_t num_bytes;
     this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
 
     uint8_t* data_ptr;
     int call_ecode = 0;
-    if (ctx.device_type == kDLCPU) {
-      data_ptr = reinterpret_cast<uint8_t*>(handle) + offset;
+    if (arr->ctx.device_type == kDLCPU) {
+      data_ptr = reinterpret_cast<uint8_t*>(data_handle) + arr->byte_offset;
     } else {
       data_ptr = this->ArenaAlloc<uint8_t>(num_bytes);
-      call_ecode =
-          TVMDeviceCopyDataFromTo(reinterpret_cast<void*>(handle), offset, data_ptr, 0, num_bytes,
-                                  ctx, DLContext{kDLCPU, 0}, type_hint, nullptr);
+      DLTensor temp;
+      temp.data = reinterpret_cast<void*>(data_ptr);
+      temp.ctx = arr->ctx;
+      temp.ndim = arr->ndim;
+      temp.dtype = arr->dtype;
+      temp.shape = arr->shape;
+      temp.strides = nullptr;
+      temp.byte_offset = 0;
+      call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr);
       // need sync to make sure that the copy is completed.
       if (call_ecode == 0) {
-        call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+        call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr);
       }
     }
 
@@ -209,30 +220,39 @@ class MinRPCServer {
   }
 
   void HandleCopyToRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-
-    this->Read(&handle);
-    this->Read(&offset);
+    DLTensor* arr = this->ArenaAlloc<DLTensor>(1);
+    uint64_t data_handle;
+    this->Read(&data_handle);
+    arr->data = reinterpret_cast<void*>(data_handle);
+    this->Read(&(arr->ctx));
+    this->Read(&(arr->ndim));
+    this->Read(&(arr->dtype));
+    arr->shape = this->ArenaAlloc<int64_t>(arr->ndim);
+    this->ReadArray(arr->shape, arr->ndim);
+    arr->strides = nullptr;
+    this->Read(&(arr->byte_offset));
+    uint64_t num_bytes;
     this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
-    int call_ecode = 0;
 
-    if (ctx.device_type == kDLCPU) {
-      uint8_t* dptr = reinterpret_cast<uint8_t*>(handle) + offset;
+    int call_ecode = 0;
+    if (arr->ctx.device_type == kDLCPU) {
+      uint8_t* dptr = reinterpret_cast<uint8_t*>(data_handle) + arr->byte_offset;
       this->ReadArray(dptr, num_bytes);
     } else {
       uint8_t* temp_data = this->ArenaAlloc<uint8_t>(num_bytes);
       this->ReadArray(temp_data, num_bytes);
-
-      call_ecode =
-          TVMDeviceCopyDataFromTo(temp_data, 0, reinterpret_cast<void*>(handle), offset, num_bytes,
-                                  DLContext{kDLCPU, 0}, ctx, type_hint, nullptr);
+      DLTensor temp;
+      temp.data = temp_data;
+      temp.ctx = DLContext{kDLCPU, 0};
+      temp.ndim = arr->ndim;
+      temp.dtype = arr->dtype;
+      temp.shape = arr->shape;
+      temp.strides = nullptr;
+      temp.byte_offset = 0;
+      call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr);
       // need sync to make sure that the copy is completed.
       if (call_ecode == 0) {
-        call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+        call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr);
       }
     }
 
@@ -269,6 +289,10 @@ class MinRPCServer {
         this->SyscallDevAllocData(values, tcodes, num_args);
         break;
       }
+      case RPCCode::kDevAllocDataWithScope: {
+        this->SyscallDevAllocDataWithScope(values, tcodes, num_args);
+        break;
+      }
       case RPCCode::kDevFreeData: {
         this->SyscallDevFreeData(values, tcodes, num_args);
         break;
@@ -342,34 +366,20 @@ class MinRPCServer {
   }
 
   void SyscallCopyAmongRemote(TVMValue* values, int* tcodes, int num_args) {
-    MINRPC_CHECK(num_args == 9);
-    // from, from_offset
-    MINRPC_CHECK(tcodes[0] == kTVMOpaqueHandle);
-    MINRPC_CHECK(tcodes[1] == kDLInt);
-    // to, to_offset
+    MINRPC_CHECK(num_args == 3);
+    // from dltensor
+    MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle);
+    // to dltensor
+    MINRPC_CHECK(tcodes[1] == kTVMDLTensorHandle);
+    // stream
     MINRPC_CHECK(tcodes[2] == kTVMOpaqueHandle);
-    MINRPC_CHECK(tcodes[3] == kDLInt);
-    // size
-    MINRPC_CHECK(tcodes[4] == kDLInt);
-    // ctx_from, ctx_to
-    MINRPC_CHECK(tcodes[5] == kTVMContext);
-    MINRPC_CHECK(tcodes[6] == kTVMContext);
-    // type_hint, stream
-    MINRPC_CHECK(tcodes[7] == kTVMDataType);
-    MINRPC_CHECK(tcodes[8] == kTVMOpaqueHandle);
 
     void* from = values[0].v_handle;
-    int64_t from_offset = values[1].v_int64;
-    void* to = values[2].v_handle;
-    int64_t to_offset = values[3].v_int64;
-    int64_t size = values[4].v_int64;
-    TVMContext ctx_from = values[5].v_ctx;
-    TVMContext ctx_to = values[6].v_ctx;
-    DLDataType type_hint = values[7].v_type;
-    TVMStreamHandle stream = values[8].v_handle;
-
-    int call_ecode = TVMDeviceCopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from,
-                                             ctx_to, type_hint, stream);
+    void* to = values[1].v_handle;
+    TVMStreamHandle stream = values[2].v_handle;
+
+    int call_ecode = TVMDeviceCopyDataFromTo(reinterpret_cast<DLTensor*>(from),
+                                             reinterpret_cast<DLTensor*>(to), stream);
 
     if (call_ecode == 0) {
       this->ReturnVoid();
@@ -400,6 +410,23 @@ class MinRPCServer {
     }
   }
 
+  void SyscallDevAllocDataWithScope(TVMValue* values, int* tcodes, int num_args) {
+    MINRPC_CHECK(num_args == 2);
+    MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle);
+    MINRPC_CHECK(tcodes[1] == kTVMNullptr || tcodes[1] == kTVMStr);
+
+    DLTensor* arr = reinterpret_cast<DLTensor*>(values[0].v_handle);
+    const char* mem_scope = (tcodes[1] == kTVMNullptr ? nullptr : values[1].v_str);
+    void* handle;
+    int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype,
+                                                      mem_scope, &handle);
+    if (call_ecode == 0) {
+      this->ReturnHandle(handle);
+    } else {
+      this->ReturnLastTVMError();
+    }
+  }
+
   void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) {
     MINRPC_CHECK(num_args == 2);
     MINRPC_CHECK(tcodes[0] == kTVMContext);
diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h
index e195b9c..07d13a7 100644
--- a/src/runtime/minrpc/rpc_reference.h
+++ b/src/runtime/minrpc/rpc_reference.h
@@ -28,7 +28,7 @@ namespace tvm {
 namespace runtime {
 
 /*! \brief The current RPC procotol version. */
-constexpr const char* kRPCProtocolVer = "0.7.0";
+constexpr const char* kRPCProtocolVer = "0.8.0";
 
 /*! \brief The RPC code */
 enum class RPCCode : int {
@@ -51,6 +51,7 @@ enum class RPCCode : int {
   kDevFreeData,
   kDevStreamSync,
   kCopyAmongRemote,
+  kDevAllocDataWithScope,
 };
 
 /*!
@@ -107,6 +108,8 @@ inline const char* RPCCodeToString(RPCCode code) {
       return "kDevStreamSync";
     case RPCCode::kCopyAmongRemote:
       return "kCopyAmongRemote";
+    case RPCCode::kDevAllocDataWithScope:
+      return "kDevAllocDataWithScope";
     default:
       return "";
   }
@@ -218,6 +221,44 @@ struct RPCReference {
     return getter.num_bytes();
   }
 
+  template <typename TChannelPtr>
+  static void SendDLTensor(TChannelPtr channel, DLTensor* arr) {
+    TVMContext ctx;
+    uint64_t data;
+    // When we return NDArray, we directly return
+    // the space and the context
+    // The client will be further wrapping
+    ctx = arr->ctx;
+    data = reinterpret_cast<uint64_t>(arr->data);
+    channel->Write(data);
+    channel->Write(ctx);
+    channel->Write(arr->ndim);
+    channel->Write(arr->dtype);
+    channel->WriteArray(arr->shape, arr->ndim);
+    if (arr->strides != nullptr) {
+      channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride);
+    }
+    channel->Write(arr->byte_offset);
+    return;
+  }
+
+  template <typename TChannelPtr>
+  static DLTensor* ReceiveDLTensor(TChannelPtr channel) {
+    uint64_t handle;
+    channel->Read(&handle);
+    DLTensor* arr = channel->template ArenaAlloc<DLTensor>(1);
+    DLTensor& tensor = *arr;
+    tensor.data = reinterpret_cast<void*>(handle);
+    channel->Read(&(tensor.ctx));
+    channel->Read(&(tensor.ndim));
+    channel->Read(&(tensor.dtype));
+    tensor.shape = channel->template ArenaAlloc<int64_t>(tensor.ndim);
+    channel->ReadArray(tensor.shape, tensor.ndim);
+    tensor.strides = nullptr;
+    channel->Read(&(tensor.byte_offset));
+    return arr;
+  }
+
   /*!
    * \brief Send packed argument sequnce to the other peer.
    *
@@ -292,24 +333,7 @@ struct RPCReference {
         }
         case kTVMDLTensorHandle: {
           DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
-          TVMContext ctx;
-          uint64_t data;
-          // When we return NDArray, we directly return
-          // the space and the context
-          // The client will be further wrapping
-          ctx = arr->ctx;
-          data = reinterpret_cast<uint64_t>(arr->data);
-          channel->Write(data);
-          channel->Write(ctx);
-          channel->Write(arr->ndim);
-          channel->Write(arr->dtype);
-          channel->WriteArray(arr->shape, arr->ndim);
-          if (arr->strides != nullptr) {
-            channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride);
-          }
-          if (arr->byte_offset != 0) {
-            channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldByteOffset);
-          }
+          SendDLTensor(channel, arr);
           break;
         }
         case kTVMNullptr:
@@ -422,19 +446,7 @@ struct RPCReference {
           break;
         }
         case kTVMDLTensorHandle: {
-          uint64_t handle;
-          channel->Read(&handle);
-          DLTensor* arr = channel->template ArenaAlloc<DLTensor>(1);
-          DLTensor& tensor = *arr;
-          tensor.data = reinterpret_cast<void*>(handle);
-          channel->Read(&(tensor.ctx));
-          channel->Read(&(tensor.ndim));
-          channel->Read(&(tensor.dtype));
-          tensor.shape = channel->template ArenaAlloc<int64_t>(tensor.ndim);
-          channel->ReadArray(tensor.shape, tensor.ndim);
-          tensor.strides = nullptr;
-          tensor.byte_offset = 0;
-          value.v_handle = arr;
+          value.v_handle = ReceiveDLTensor(channel);
           break;
         }
         default: {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index dae7756..d3ddbf8 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -24,6 +24,7 @@
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
 #include <tvm/support/logging.h>
 
 #include "runtime_base.h"
@@ -58,36 +59,39 @@ inline void VerifyDataType(DLDataType dtype) {
   ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
 }
 
-inline size_t GetDataAlignment(const DLTensor& arr) {
-  size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes;
-  if (align < kAllocAlignment) return kAllocAlignment;
-  return align;
-}
-
 void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
   size_t arr_size = GetDataSize(*handle);
   ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch";
   ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now";
-  DeviceAPI::Get(handle->ctx)
-      ->CopyDataFromTo(data, 0, handle->data, static_cast<size_t>(handle->byte_offset), nbytes,
-                       cpu_ctx, handle->ctx, handle->dtype, nullptr);
+
+  DLTensor from;
+  from.data = const_cast<void*>(data);
+  from.ctx = DLContext{kDLCPU, 0};
+  from.ndim = handle->ndim;
+  from.dtype = handle->dtype;
+  from.shape = handle->shape;
+  from.strides = nullptr;
+  from.byte_offset = 0;
+  DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr);
   // Synchronize in case data become unavailable later.
   DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr);
 }
 
 void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
   size_t arr_size = GetDataSize(*handle);
   ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch";
   ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now";
-  DeviceAPI::Get(handle->ctx)
-      ->CopyDataFromTo(handle->data, static_cast<size_t>(handle->byte_offset), data, 0, nbytes,
-                       handle->ctx, cpu_ctx, handle->dtype, nullptr);
+
+  DLTensor to;
+  to.data = const_cast<void*>(data);
+  to.ctx = DLContext{kDLCPU, 0};
+  to.ndim = handle->ndim;
+  to.dtype = handle->dtype;
+  to.shape = handle->shape;
+  to.strides = nullptr;
+  to.byte_offset = 0;
+
+  DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast<DLTensor*>(handle), &to, nullptr);
   // Synchronize in case data become unavailable later.
   DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr);
 }
@@ -186,13 +190,11 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape, DLDataType dtype) {
 
 DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); }
 
-NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) {
+NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
+                       Optional<String> mem_scope) {
   NDArray ret = Internal::Create(shape, dtype, ctx);
-  // setup memory content
-  size_t size = GetDataSize(ret.get_mutable()->dl_tensor);
-  size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor);
-  ret.get_mutable()->dl_tensor.data =
-      DeviceAPI::Get(ret->ctx)->AllocDataSpace(ret->ctx, size, alignment, ret->dtype);
+  ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace(
+      ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope);
   return ret;
 }
 
@@ -236,9 +238,7 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str
   // api manager.
   TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
 
-  DeviceAPI::Get(ctx)->CopyDataFromTo(from->data, static_cast<size_t>(from->byte_offset), to->data,
-                                      static_cast<size_t>(to->byte_offset), from_size, from->ctx,
-                                      to->ctx, from->dtype, stream);
+  DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast<DLTensor*>(from), to, stream);
 }
 
 std::vector<int64_t> NDArray::Shape() const { return get_mutable()->shape_; }
@@ -279,6 +279,17 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_
   API_END();
 }
 
+TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, TVMRetValue* ret) {
+  int64_t* shape_ptr = static_cast<int64_t*>(static_cast<void*>(args[0]));
+  int ndim = args[1];
+  std::vector<int64_t> shape(shape_ptr, shape_ptr + ndim);
+  DataType dtype = args[2];
+  TVMContext ctx = args[3];
+  Optional<String> mem_scope = args[4];
+  auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope);
+  *ret = ndarray;
+});
+
 int TVMArrayFree(TVMArrayHandle handle) {
   API_BEGIN();
   NDArray::Internal::FFIDecRef(handle);
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index fa118ed..2e7f05f 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -232,9 +232,6 @@ class OpenCLWorkspace : public DeviceAPI {
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
   void FreeWorkspace(TVMContext ctx, void* data) final;
@@ -246,6 +243,11 @@ class OpenCLWorkspace : public DeviceAPI {
 
   // get the global workspace
   static OpenCLWorkspace* Global();
+
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
+                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+                      TVMStreamHandle stream) final;
 };
 
 /*! \brief Thread local workspace */
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index a1e96e9..06737f9 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -43,6 +43,18 @@ class RPCDeviceAPI final : public DeviceAPI {
     GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv);
   }
 
+  void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                       Optional<String> mem_scope) final {
+    auto sess = GetSess(ctx);
+    auto remote_ctx = RemoveRPCSessionMask(ctx);
+    void* data =
+        sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope);
+    RemoteSpace* space = new RemoteSpace();
+    space->data = data;
+    space->sess = std::move(sess);
+    return space;
+  }
+
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
                        DLDataType type_hint) final {
     auto sess = GetSess(ctx);
@@ -65,30 +77,36 @@ class RPCDeviceAPI final : public DeviceAPI {
     }
     delete space;
   }
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final {
+
+  void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
+    DLContext ctx_from = from->ctx;
+    DLContext ctx_to = to->ctx;
     if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) {
       ICHECK(ctx_from.device_type == ctx_to.device_type)
           << "Cannot copy across two different remote session";
-      auto remote_ctx_from = RemoveRPCSessionMask(ctx_from);
-      auto remote_ctx_to = RemoveRPCSessionMask(ctx_to);
-      auto remote_ctx = remote_ctx_from;
-      if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to;
-      GetSess(ctx_from)
-          ->GetDeviceAPI(remote_ctx)
-          ->CopyDataFromTo(static_cast<const RemoteSpace*>(from)->data, from_offset,
-                           static_cast<const RemoteSpace*>(to)->data, to_offset, size,
-                           remote_ctx_from, remote_ctx_to, type_hint, stream);
+      DLTensor from_tensor = *from;
+      from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+      from_tensor.data = static_cast<const RemoteSpace*>(from->data)->data;
+      DLTensor to_tensor = *to;
+      to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+      to_tensor.data = static_cast<const RemoteSpace*>(to->data)->data;
+      auto remote_ctx = from_tensor.ctx;
+      if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx;
+      GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream);
     } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) {
-      auto remote_ctx_from = RemoveRPCSessionMask(ctx_from);
-      GetSess(ctx_from)->CopyFromRemote(static_cast<const RemoteSpace*>(from)->data, from_offset,
-                                        to, to_offset, size, remote_ctx_from, type_hint);
+      DLTensor from_tensor = *from;
+      from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+      from_tensor.data = static_cast<const RemoteSpace*>(from->data)->data;
+      void* to_bytes = static_cast<char*>(to->data) + to->byte_offset;
+      size_t nbytes = GetDataSize(*to);
+      GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes);
     } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) {
-      auto remote_ctx_to = RemoveRPCSessionMask(ctx_to);
-      GetSess(ctx_to)->CopyToRemote(const_cast<void*>(from), from_offset,
-                                    static_cast<const RemoteSpace*>(to)->data, to_offset, size,
-                                    remote_ctx_to, type_hint);
+      DLTensor to_tensor = *to;
+      to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+      to_tensor.data = static_cast<const RemoteSpace*>(to->data)->data;
+      void* from_bytes = static_cast<char*>(from->data) + from->byte_offset;
+      size_t nbytes = GetDataSize(*from);
+      GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes);
     } else {
       LOG(FATAL) << "expect copy from/to remote or between remote";
     }
@@ -99,6 +117,13 @@ class RPCDeviceAPI final : public DeviceAPI {
     GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream);
   }
 
+ protected:
+  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+                      size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+                      DLDataType type_hint, TVMStreamHandle stream) final {
+    LOG(FATAL) << "Not implemented.";
+  }
+
  private:
   std::shared_ptr<RPCSession> GetSess(TVMContext ctx) {
     int tbl_index = GetRPCSessionIndex(ctx);
diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc
index fbdd93f..8716355 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -387,88 +387,72 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
   void HandleSyscall(RPCCode code);
 
   void HandleCopyFromRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-    this->Read(&handle);
-    this->Read(&offset);
-    this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
-    size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
-
+    DLTensor* arr = RPCReference::ReceiveDLTensor(this);
+    uint64_t data_bytes;
+    this->Read(&data_bytes);
+    size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
     auto* sess = GetServingSession();
-
     // Return Copy Ack with the given data
-    auto fcopyack = [this](char* data_ptr, size_t num_bytes) {
+    auto fcopyack = [this](char* dptr, size_t num_bytes) {
       RPCCode code = RPCCode::kCopyAck;
       uint64_t packet_nbytes = sizeof(code) + num_bytes;
 
       this->Write(packet_nbytes);
       this->Write(code);
-      this->WriteArray(data_ptr, num_bytes);
+      this->WriteArray(dptr, num_bytes);
       this->SwitchToState(kRecvPacketNumBytes);
     };
 
     // When session is local, we can directly treat handle
     // as the cpu pointer without allocating a temp space.
-    if (ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) {
-      char* data_ptr = reinterpret_cast<char*>(handle) + offset;
-      fcopyack(data_ptr, num_bytes);
+    if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) {
+      char* data_ptr = reinterpret_cast<char*>(arr->data) + arr->byte_offset;
+      fcopyack(data_ptr, data_bytes);
     } else {
-      char* data_ptr = this->ArenaAlloc<char>(num_bytes);
-
-      auto on_copy_complete = [this, elem_bytes, num_bytes, data_ptr, fcopyack](RPCCode status,
-                                                                                TVMArgs args) {
+      char* temp_data = this->ArenaAlloc<char>(data_bytes);
+      auto on_copy_complete = [this, elem_bytes, data_bytes, temp_data, fcopyack](RPCCode status,
+                                                                                  TVMArgs args) {
         if (status == RPCCode::kException) {
           this->ReturnException(args.values[0].v_str);
           this->SwitchToState(kRecvPacketNumBytes);
         } else {
           // endian aware handling
           if (!DMLC_IO_NO_ENDIAN_SWAP) {
-            dmlc::ByteSwap(data_ptr, elem_bytes, num_bytes / elem_bytes);
+            dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes);
           }
-          fcopyack(data_ptr, num_bytes);
+          fcopyack(temp_data, data_bytes);
         }
       };
 
       this->SwitchToState(kWaitForAsyncCallback);
-      sess->AsyncCopyFromRemote(reinterpret_cast<void*>(handle), offset, data_ptr, 0, num_bytes,
-                                ctx, type_hint, on_copy_complete);
+      sess->AsyncCopyFromRemote(arr, static_cast<void*>(temp_data), data_bytes, on_copy_complete);
     }
   }
 
   void HandleCopyToRemote() {
-    uint64_t handle, offset, num_bytes;
-    TVMContext ctx;
-    DLDataType type_hint;
-
-    this->Read(&handle);
-    this->Read(&offset);
-    this->Read(&num_bytes);
-    this->Read(&ctx);
-    this->Read(&type_hint);
-
-    size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
+    DLTensor* arr = RPCReference::ReceiveDLTensor(this);
+    uint64_t data_bytes;
+    this->Read(&data_bytes);
+    size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
     auto* sess = GetServingSession();
 
     // When session is local, we can directly treat handle
     // as the cpu pointer without allocating a temp space.
-    if (ctx.device_type == kDLCPU && sess->IsLocalSession()) {
-      char* dptr = reinterpret_cast<char*>(handle) + offset;
-      this->ReadArray(dptr, num_bytes);
+    if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) {
+      char* dptr = reinterpret_cast<char*>(arr->data) + arr->byte_offset;
+      this->ReadArray(dptr, data_bytes);
 
       if (!DMLC_IO_NO_ENDIAN_SWAP) {
-        dmlc::ByteSwap(dptr, elem_bytes, num_bytes / elem_bytes);
+        dmlc::ByteSwap(dptr, elem_bytes, data_bytes / elem_bytes);
       }
       this->ReturnVoid();
       this->SwitchToState(kRecvPacketNumBytes);
     } else {
-      char* temp_data = this->ArenaAlloc<char>(num_bytes);
-      this->ReadArray(temp_data, num_bytes);
+      char* temp_data = this->ArenaAlloc<char>(data_bytes);
+      this->ReadArray(temp_data, data_bytes);
 
       if (!DMLC_IO_NO_ENDIAN_SWAP) {
-        dmlc::ByteSwap(temp_data, elem_bytes, num_bytes / elem_bytes);
+        dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes);
       }
 
       auto on_copy_complete = [this](RPCCode status, TVMArgs args) {
@@ -482,8 +466,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
       };
 
       this->SwitchToState(kWaitForAsyncCallback);
-      sess->AsyncCopyToRemote(temp_data, 0, reinterpret_cast<void*>(handle), offset, num_bytes, ctx,
-                              type_hint, on_copy_complete);
+      sess->AsyncCopyToRemote(static_cast<void*>(temp_data), arr, data_bytes, on_copy_complete);
     }
   }
 
@@ -815,51 +798,47 @@ void RPCEndpoint::CallFunc(RPCSession::PackedFuncHandle h, const TVMValue* arg_v
   ICHECK(code == RPCCode::kReturn) << "code=" << static_cast<int>(code);
 }
 
-void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                               size_t data_size, TVMContext ctx_to, DLDataType type_hint) {
+void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) {
   std::lock_guard<std::mutex> lock(mutex_);
   RPCCode code = RPCCode::kCopyToRemote;
-  uint64_t handle = reinterpret_cast<uint64_t>(to);
-  uint64_t offset = static_cast<uint64_t>(to_offset);
-  uint64_t size = static_cast<uint64_t>(data_size);
 
-  uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) +
-                           sizeof(ctx_to) + sizeof(type_hint) + data_size;
+  uint64_t num_data_bytes = static_cast<uint64_t>(GetDataSize(*to));
+  ICHECK_EQ(nbytes, num_data_bytes);
+
+  uint64_t to_data = reinterpret_cast<uint64_t>(to->data);
+  uint64_t shape_bytes = to->ndim * sizeof(int64_t);
+  uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) +
+                           sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes +
+                           sizeof(nbytes) + num_data_bytes;
 
   handler_->Write(packet_nbytes);
   handler_->Write(code);
-  handler_->Write(handle);
-  handler_->Write(offset);
-  handler_->Write(size);
-  handler_->Write(ctx_to);
-  handler_->Write(type_hint);
-  handler_->WriteArray(reinterpret_cast<char*>(from) + from_offset, data_size);
-
+  RPCReference::SendDLTensor(handler_, to);
+  handler_->Write(nbytes);
+  handler_->WriteArray(reinterpret_cast<char*>(from_bytes), nbytes);
   ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn);
 }
 
-void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                                 size_t data_size, TVMContext ctx_from, DLDataType type_hint) {
+void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
   std::lock_guard<std::mutex> lock(mutex_);
   RPCCode code = RPCCode::kCopyFromRemote;
-  uint64_t handle = reinterpret_cast<uint64_t>(from);
-  uint64_t offset = static_cast<uint64_t>(from_offset);
-  uint64_t size = static_cast<uint64_t>(data_size);
 
-  uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) +
-                           sizeof(ctx_from) + sizeof(type_hint);
+  uint64_t num_data_bytes = static_cast<uint64_t>(GetDataSize(*from));
+  CHECK_EQ(nbytes, num_data_bytes);
+
+  uint64_t from_data = reinterpret_cast<uint64_t>(from->data);
+  uint64_t shape_bytes = from->ndim * sizeof(int64_t);
+  uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) +
+                           sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) +
+                           shape_bytes + sizeof(nbytes);
 
   handler_->Write(packet_nbytes);
   handler_->Write(code);
-  handler_->Write(handle);
-  handler_->Write(offset);
-  handler_->Write(size);
-  handler_->Write(ctx_from);
-  handler_->Write(type_hint);
-
-  TVMRetValue rv;
+  RPCReference::SendDLTensor(handler_, from);
+  handler_->Write(nbytes);
   ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck);
-  handler_->ReadArray(reinterpret_cast<char*>(to) + to_offset, data_size);
+
+  handler_->ReadArray(reinterpret_cast<char*>(to_bytes), nbytes);
   handler_->FinishCopyAck();
 }
 
@@ -904,6 +883,23 @@ void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
   *rv = data;
 }
 
+void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
+  DLTensor* arr = args[0];
+  TVMContext ctx = arr->ctx;
+  int ndim = arr->ndim;
+  int64_t* shape = arr->shape;
+  DLDataType dtype = arr->dtype;
+  int tcode = args[1].type_code();
+  Optional<String> mem_scope = NullOpt;
+  if (tcode == kTVMStr) {
+    mem_scope = args[1].operator String();
+  } else {
+    ICHECK_EQ(tcode, kTVMNullptr);
+  }
+  void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope);
+  *rv = data;
+}
+
 void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
   TVMContext ctx = args[0];
   void* ptr = args[1];
@@ -911,25 +907,18 @@ void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
 }
 
 void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
-  void* from = args[0];
-  uint64_t from_offset = args[1];
-  void* to = args[2];
-  uint64_t to_offset = args[3];
-  uint64_t size = args[4];
-  TVMContext ctx_from = args[5];
-  TVMContext ctx_to = args[6];
-  DLDataType type_hint = args[7];
-  TVMStreamHandle stream = args[8];
-  TVMContext ctx = ctx_from;
+  DLTensor* from = args[0];
+  DLTensor* to = args[1];
+  TVMStreamHandle stream = args[2];
 
+  TVMContext ctx = from->ctx;
   if (ctx.device_type == kDLCPU) {
-    ctx = ctx_to;
+    ctx = to->ctx;
   } else {
-    ICHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type)
+    ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type)
         << "Can not copy across different ctx types directly";
   }
-  handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from,
-                                             ctx_to, type_hint, stream);
+  handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream);
 }
 
 void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) {
@@ -951,6 +940,9 @@ void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) {
     case RPCCode::kDevAllocData:
       SysCallHandler(RPCDevAllocData);
       break;
+    case RPCCode::kDevAllocDataWithScope:
+      SysCallHandler(RPCDevAllocDataWithScope);
+      break;
     case RPCCode::kDevFreeData:
       SysCallHandler(RPCDevFreeData);
       break;
@@ -989,14 +981,12 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
     endpoint_->CallFunc(func, arg_values, arg_type_codes, num_args, fencode_return);
   }
 
-  void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                    TVMContext ctx_to, DLDataType type_hint) final {
-    endpoint_->CopyToRemote(from, from_offset, to, to_offset, nbytes, ctx_to, type_hint);
+  void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) final {
+    endpoint_->CopyToRemote(local_from_bytes, remote_to, nbytes);
   }
 
-  void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                      TVMContext ctx_from, DLDataType type_hint) final {
-    endpoint_->CopyFromRemote(from, from_offset, to, to_offset, nbytes, ctx_from, type_hint);
+  void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) final {
+    endpoint_->CopyFromRemote(remote_from, local_to_bytes, nbytes);
   }
 
   void FreeHandle(void* handle, int type_code) final {
@@ -1019,15 +1009,30 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
     return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint);
   }
 
+  void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                       Optional<String> mem_scope) final {
+    DLTensor temp;
+    temp.data = nullptr;
+    temp.ctx = ctx;
+    temp.ndim = ndim;
+    temp.dtype = dtype;
+    temp.shape = const_cast<int64_t*>(shape);
+    temp.strides = nullptr;
+    temp.byte_offset = 0;
+    if (mem_scope.defined()) {
+      return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp,
+                                      static_cast<std::string>(mem_scope.value()));
+    } else {
+      return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, nullptr);
+    }
+  }
+
   void FreeDataSpace(TVMContext ctx, void* ptr) final {
     endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr);
   }
 
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
-                      TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final {
-    endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, const_cast<void*>(from), from_offset, to,
-                             to_offset, size, ctx_from, ctx_to, type_hint, stream);
+  void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
+    endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream);
   }
 
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h
index 031435f..8e08bfa 100644
--- a/src/runtime/rpc/rpc_endpoint.h
+++ b/src/runtime/rpc/rpc_endpoint.h
@@ -135,8 +135,7 @@ class RPCEndpoint {
    * \param ctx_to The target context.
    * \param type_hint Hint of content data type.
    */
-  void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                    TVMContext ctx_to, DLDataType type_hint);
+  void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes);
   /*!
    * \brief Copy bytes from remote array content.
    * \param from The source host data.
@@ -147,8 +146,7 @@ class RPCEndpoint {
    * \param ctx_from The source context.
    * \param type_hint Hint of content data type.
    */
-  void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                      TVMContext ctx_from, DLDataType type_hint);
+  void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes);
 
   /*!
    * \brief Call a remote defined system function with arguments.
diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc
index b35c62d..0650b55d 100644
--- a/src/runtime/rpc/rpc_local_session.cc
+++ b/src/runtime/rpc/rpc_local_session.cc
@@ -87,26 +87,36 @@ void LocalSession::CallFunc(RPCSession::PackedFuncHandle func, const TVMValue* a
   this->EncodeReturn(std::move(rv), encode_return);
 }
 
-void LocalSession::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                                size_t nbytes, TVMContext ctx_to, DLDataType type_hint) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
-  this->GetDeviceAPI(ctx_to)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, cpu_ctx,
-                                             ctx_to, type_hint, nullptr);
+void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) {
+  ICHECK_EQ(nbytes, GetDataSize(*to));
+  DLTensor from;
+  from.data = from_bytes;
+  from.ctx = {kDLCPU, 0};
+  from.ndim = to->ndim;
+  from.shape = to->shape;
+  from.dtype = to->dtype;
+  from.strides = nullptr;
+  from.byte_offset = 0;
+  TVMContext ctx_to = to->ctx;
+  this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr);
   // Copy can happen asynchrously
   // synchronize to make sure that copy is completed
   this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr);
 }
 
-void LocalSession::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset,
-                                  size_t nbytes, TVMContext ctx_from, DLDataType type_hint) {
-  TVMContext cpu_ctx;
-  cpu_ctx.device_type = kDLCPU;
-  cpu_ctx.device_id = 0;
-
-  this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, ctx_from,
-                                               cpu_ctx, type_hint, nullptr);
+void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
+  ICHECK_EQ(nbytes, GetDataSize(*from));
+  DLTensor to;
+  to.data = to_bytes;
+  to.ctx = {kDLCPU, 0};
+  to.ndim = from->ndim;
+  to.shape = from->shape;
+  to.dtype = from->dtype;
+  to.strides = nullptr;
+  to.byte_offset = 0;
+
+  TVMContext ctx_from = from->ctx;
+  this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr);
   // Copy can happen asynchrously
   // synchronize to make sure that copy is completed
   this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr);
diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h
index 7a67ce8..ea070e3 100644
--- a/src/runtime/rpc/rpc_local_session.h
+++ b/src/runtime/rpc/rpc_local_session.h
@@ -48,11 +48,9 @@ class LocalSession : public RPCSession {
   void CallFunc(PackedFuncHandle func, const TVMValue* arg_values, const int* arg_type_codes,
                 int num_args, const FEncodeReturn& fencode_return) override;
 
-  void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                    TVMContext ctx_to, DLDataType type_hint) override;
+  void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) override;
 
-  void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
-                      TVMContext ctx_from, DLDataType type_hint) override;
+  void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) override;
 
   void FreeHandle(void* handle, int type_code) override;
 
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index f5405f0..0ac5b8d 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -51,33 +51,28 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values
   }
 }
 
-void RPCSession::AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                                   size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                                   DLDataType type_hint, RPCSession::FAsyncCallback callback) {
+void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+                                   RPCSession::FAsyncCallback callback) {
   TVMValue value;
   int32_t tcode = kTVMNullptr;
   value.v_handle = nullptr;
 
   try {
-    this->CopyToRemote(local_from, local_from_offset, remote_to, remote_to_offset, nbytes,
-                       remote_ctx_to, type_hint);
+    this->CopyToRemote(local_from_bytes, remote_to, nbytes);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
   } catch (const std::runtime_error& e) {
     this->SendException(callback, e.what());
   }
 }
 
-void RPCSession::AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                                     size_t local_to_offset, size_t nbytes,
-                                     TVMContext remote_ctx_from, DLDataType type_hint,
+void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
                                      RPCSession::FAsyncCallback callback) {
   TVMValue value;
   int32_t tcode = kTVMNullptr;
   value.v_handle = nullptr;
 
   try {
-    this->CopyFromRemote(remote_from, remote_from_offset, local_to, local_to_offset, nbytes,
-                         remote_ctx_from, type_hint);
+    this->CopyFromRemote(remote_from, local_to_bytes, nbytes);
     callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
   } catch (const std::runtime_error& e) {
     this->SendException(callback, e.what());
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 4ea937a..4b942f2 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -127,30 +127,18 @@ class RPCSession {
 
   /*!
    * \brief Copy bytes into remote array content.
-   * \param local_from The source host data.
-   * \param local_from_offset The byte offeset in the from.
+   * \param local_from_bytes The source host data.
    * \param remote_to The target array.
-   * \param remote_to_offset The byte offset in the to.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_to The target context.
-   * \param type_hint Hint of content data type.
    */
-  virtual void CopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                            size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                            DLDataType type_hint) = 0;
+  virtual void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) = 0;
   /*!
    * \brief Copy bytes from remote array content.
    * \param remote_from The source host data.
-   * \param remote_from_offset The byte offeset in the from.
-   * \param to The target array.
-   * \param to_offset The byte offset in the to.
+   * \param local_to_bytes The target array.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_from The source context in the remote.
-   * \param type_hint Hint of content data type.
    */
-  virtual void CopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                              size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from,
-                              DLDataType type_hint) = 0;
+  virtual void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) = 0;
 
   /*!
    * \brief Free a remote function.
@@ -223,40 +211,27 @@ class RPCSession {
   /*!
    * \brief Asynchrous version of CopyToRemote.
    *
-   * \param local_from The source host data.
-   * \param local_from_offset The byte offeset in the from.
+   * \param local_from_bytes The source host data.
    * \param remote_to The target array.
-   * \param remote_to_offset The byte offset in the to.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_to The target context.
-   * \param type_hint Hint of content data type.
-   *
    * \param on_complete The callback to signal copy complete.
    * \note All the allocated memory in local_from, and remote_to
    *       must stay alive until on_compelete is called.
    */
-  virtual void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                                 size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                                 DLDataType type_hint, FAsyncCallback on_complete);
+  virtual void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+                                 FAsyncCallback on_complete);
 
   /*!
    * \brief Asynchrous version of CopyFromRemote.
    *
    * \param remote_from The source host data.
-   * \param remote_from_offset The byte offeset in the from.
-   * \param to The target array.
-   * \param to_offset The byte offset in the to.
+   * \param local_to_bytes The target array.
    * \param nbytes The size of the memory in bytes.
-   * \param remote_ctx_from The source context in the remote.
-   * \param type_hint Hint of content data type.
-   *
    * \param on_complete The callback to signal copy complete.
    * \note All the allocated memory in remote_from, and local_to
    *       must stay alive until on_compelete is called.
    */
-  virtual void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                                   size_t local_to_offset, size_t nbytes,
-                                   TVMContext remote_ctx_from, DLDataType type_hint,
+  virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
                                    FAsyncCallback on_complete);
   /*!
    * \brief Asynchrously wait for all events in ctx, stream compeletes.
diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc
index cbf1974..f40fd80 100644
--- a/src/runtime/vulkan/vulkan.cc
+++ b/src/runtime/vulkan/vulkan.cc
@@ -199,6 +199,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
     delete pbuf;
   }
 
+ protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                       TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
@@ -307,6 +308,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
     }
   }
 
+ public:
   // Always use the default stream
   TVMStreamHandle CreateStream(TVMContext ctx) {
     LOG(FATAL) << "Not implemented";
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index 6abd122..b72caad 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -177,33 +177,37 @@ class AsyncLocalSession : public LocalSession {
     }
   }
 
-  void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
-                         size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
-                         DLDataType type_hint, FAsyncCallback on_complete) final {
-    TVMContext cpu_ctx;
-    cpu_ctx.device_type = kDLCPU;
-    cpu_ctx.device_id = 0;
+  void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+                         FAsyncCallback on_complete) final {
     try {
-      this->GetDeviceAPI(remote_ctx_to)
-          ->CopyDataFromTo(local_from, local_from_offset, remote_to, remote_to_offset, nbytes,
-                           cpu_ctx, remote_ctx_to, type_hint, nullptr);
-      this->AsyncStreamWait(remote_ctx_to, nullptr, on_complete);
+      DLTensor local_from;
+      local_from.data = local_from_bytes;
+      local_from.ctx = TVMContext{kDLCPU, 0};
+      local_from.ndim = remote_to->ndim;
+      local_from.shape = remote_to->shape;
+      local_from.dtype = remote_to->dtype;
+      local_from.strides = nullptr;
+      local_from.byte_offset = 0;
+      this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr);
+      this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete);
     } catch (const std::runtime_error& e) {
       this->SendException(on_complete, e.what());
     }
   }
 
-  void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
-                           size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from,
-                           DLDataType type_hint, FAsyncCallback on_complete) final {
-    TVMContext cpu_ctx;
-    cpu_ctx.device_type = kDLCPU;
-    cpu_ctx.device_id = 0;
+  void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
+                           FAsyncCallback on_complete) final {
     try {
-      this->GetDeviceAPI(remote_ctx_from)
-          ->CopyDataFromTo(remote_from, remote_from_offset, local_to, local_to_offset, nbytes,
-                           remote_ctx_from, cpu_ctx, type_hint, nullptr);
-      this->AsyncStreamWait(remote_ctx_from, nullptr, on_complete);
+      DLTensor local_to;
+      local_to.data = local_to_bytes;
+      local_to.ctx = TVMContext{kDLCPU, 0};
+      local_to.ndim = remote_from->ndim;
+      local_to.shape = remote_from->shape;
+      local_to.dtype = remote_from->dtype;
+      local_to.strides = nullptr;
+      local_to.byte_offset = 0;
+      this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, nullptr);
+      this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete);
     } catch (const std::runtime_error& e) {
       this->SendException(on_complete, e.what());
     }
diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc
index 54601e3..62b87af 100644
--- a/web/emcc/webgpu_runtime.cc
+++ b/web/emcc/webgpu_runtime.cc
@@ -82,6 +82,7 @@ class WebGPUDeviceAPI : public DeviceAPI {
 
   void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); }
 
+ protected:
   void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
                       TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
                       TVMStreamHandle stream) final {
@@ -102,6 +103,7 @@ class WebGPUDeviceAPI : public DeviceAPI {
     }
   }
 
+ public:
   TVMStreamHandle CreateStream(TVMContext ctx) final {
     LOG(FATAL) << "Not implemented";
     return nullptr;