You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2021/02/28 15:52:33 UTC
[tvm] branch main updated: [Runtime] Special Memory Scope Support
(#7488)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 2673309 [Runtime] Special Memory Scope Support (#7488)
2673309 is described below
commit 26733095f5a1e0887c32d644429d430bc1f51c91
Author: ziheng <zi...@apache.org>
AuthorDate: Sun Feb 28 23:52:16 2021 +0800
[Runtime] Special Memory Scope Support (#7488)
---
include/tvm/runtime/c_runtime_api.h | 35 ++++--
include/tvm/runtime/device_api.h | 42 +++++--
include/tvm/runtime/ndarray.h | 7 +-
python/tvm/runtime/ndarray.py | 38 +++---
src/runtime/c_runtime_api.cc | 64 +++++++++-
src/runtime/cpu_device_api.cc | 13 +-
src/runtime/crt/common/crt_runtime_api.c | 39 +++++-
src/runtime/cuda/cuda_device_api.cc | 2 +
src/runtime/hexagon/hexagon_device_api.cc | 8 +-
src/runtime/metal/metal_common.h | 8 +-
src/runtime/minrpc/minrpc_server.h | 137 ++++++++++++--------
src/runtime/minrpc/rpc_reference.h | 76 ++++++-----
src/runtime/ndarray.cc | 65 ++++++----
src/runtime/opencl/opencl_common.h | 8 +-
src/runtime/rpc/rpc_device_api.cc | 63 +++++++---
src/runtime/rpc/rpc_endpoint.cc | 201 +++++++++++++++---------------
src/runtime/rpc/rpc_endpoint.h | 6 +-
src/runtime/rpc/rpc_local_session.cc | 40 +++---
src/runtime/rpc/rpc_local_session.h | 6 +-
src/runtime/rpc/rpc_session.cc | 15 +--
src/runtime/rpc/rpc_session.h | 43 ++-----
src/runtime/vulkan/vulkan.cc | 2 +
web/emcc/tvmjs_support.cc | 44 ++++---
web/emcc/webgpu_runtime.cc | 2 +
24 files changed, 577 insertions(+), 387 deletions(-)
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 467e69a..59316a0 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -560,6 +560,23 @@ TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignme
DLDataType type_hint, void** out_data);
/*!
+ * \brief Allocate a data space on device with special memory scope.
+ * \note The memory could use a special multi-dimensional memory layout.
+ * That is why we pass shape and dtype instead of raw number of bytes.
+ * \param ctx The device context to perform operation.
+ * \param ndim The number of dimension of the tensor.
+ * \param shape The shape of the tensor.
+ * \param dtype The type of elements.
+ * \param mem_scope The memory scope of the tensor,
+ * can be nullptr, which indicate the default global DRAM
+ * \param out_data The allocated device pointer.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+ DLDataType dtype, const char* mem_scope,
+ void** out_data);
+
+/*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
* \param ptr The data space.
@@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr);
/*!
* \brief Copy data from one place to another.
- * \param from The source array.
- * \param from_offset The byte offeset in the from.
- * \param to The target array.
- * \param to_offset The byte offset in the to.
- * \param num_bytes The size of the memory in bytes
- * \param ctx_from The source context
- * \param ctx_to The target context
- * \param type_hint The type of elements, only neded by certain backends.
- * can be useful for cross device endian converison.
+ * \note This API is designed to support special memory with shape dependent layout.
+ * We pass in DLTensor* with shape information to support these cases.
+ * \param from The source tensor.
+ * \param to The target tensor.
* \param stream Optional stream object.
* \return 0 when success, -1 when failure happens.
*/
-TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to,
- size_t to_offset, size_t num_bytes, TVMContext ctx_from,
- TVMContext ctx_to, DLDataType type_hint,
- TVMStreamHandle stream);
+TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
/*!
* \brief Check that an object is derived from another.
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index a6f5624..1276663 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -91,6 +91,17 @@ class TVM_DLL DeviceAPI {
virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint) = 0;
/*!
+ * \brief Allocate a data space on device with memory scope support.
+ * \param ctx The device context to perform operation.
+ * \param ndim The number of dimension of allocated tensor.
+ * \param shape The shape of allocated tensor.
+ * \param dtype The type of elements.
+ * \param mem_scope The memory scope of allocated tensor.
+ * \return The allocated device pointer.
+ */
+ virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+ Optional<String> mem_scope = NullOpt);
+ /*!
* \brief Free a data space on device.
* \param ctx The device context to perform operation.
* \param ptr The data space.
@@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI {
virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0;
/*!
* \brief copy data from one place to another
+ * \note This API is designed to support special memory with shape dependent layout.
+ * We pass in DLTensor* with shape information to support these cases.
* \param from The source array.
- * \param from_offset The byte offeset in the from.
* \param to The target array.
- * \param to_offset The byte offset in the to.
- * \param num_bytes The size of the memory in bytes
- * \param ctx_from The source context
- * \param ctx_to The target context
- * \param type_hint The type of elements, only neded by certain backends.
- * can be useful for cross device endian converison.
* \param stream Optional stream object.
*/
- virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
- size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
- DLDataType type_hint, TVMStreamHandle stream) = 0;
+ virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream);
/*!
* \brief Create a new stream of execution.
*
@@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI {
static bool NeedSetDeviceContext(int device_type) {
return device_type != kDLCPU && device_type != kDLMicroDev;
}
+
+ protected:
+ /*!
+ * \brief copy data from one place to another
+ * \param from The source array.
+ * \param from_offset The byte offeset in the from.
+ * \param to The target array.
+ * \param to_offset The byte offset in the to.
+ * \param num_bytes The size of the memory in bytes
+ * \param ctx_from The source context
+ * \param ctx_to The target context
+ * \param type_hint The type of elements, only neded by certain backends.
+ * can be useful for cross device endian converison.
+ * \param stream Optional stream object.
+ */
+ virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+ size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+ DLDataType type_hint, TVMStreamHandle stream);
};
/*! \brief The device type bigger than this is RPC device */
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 0ff171d..a884b5c 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -25,6 +25,7 @@
#define TVM_RUNTIME_NDARRAY_H_
#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/container.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/serializer.h>
@@ -133,10 +134,12 @@ class NDArray : public ObjectRef {
* \brief Create an empty NDArray.
* \param shape The shape of the new array.
* \param dtype The data type of the new array.
- * \param ctx The context of the Array.
+ * \param ctx The context of the array.
+ * \param mem_scope The memory scope of the array.
* \return The created Array
*/
- TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
+ TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
+ Optional<String> mem_scope = NullOpt);
/*!
* \brief Create a NDArray backed by a dlpack tensor.
*
diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py
index 2f616ce..75da3d4 100644
--- a/python/tvm/runtime/ndarray.py
+++ b/python/tvm/runtime/ndarray.py
@@ -23,6 +23,7 @@ import tvm._ffi
from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE
from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle
from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t
+from . import _ffi_api
try:
# pylint: disable=wrong-import-position
@@ -253,42 +254,41 @@ def numpyasarray(np_data):
return arr, shape
-def empty(shape, dtype="float32", ctx=context(1, 0)):
+def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None):
"""Create an empty array given shape and device
Parameters
----------
shape : tuple of int
- The shape of the array
+ The shape of the array.
dtype : type or str
The data type of the array.
ctx : TVMContext
- The context of the array
+ The context of the array.
+
+ mem_scope : Optional[str]
+ The memory scope of the array.
Returns
-------
arr : tvm.nd.NDArray
The array tvm supported.
"""
- shape = c_array(tvm_shape_index_t, shape)
- ndim = ctypes.c_int(len(shape))
- handle = TVMArrayHandle()
+ shape_imm = []
+ for s in shape:
+ if isinstance(s, tvm.tir.IntImm):
+ shape_imm.append(s.value)
+ else:
+ shape_imm.append(int(s))
+ arr = np.array(shape_imm, "int64")
+ ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+ shape_ptr = ctypes.cast(ptr, ctypes.c_void_p)
+ ndim = len(shape_imm)
dtype = DataType(dtype)
- check_call(
- _LIB.TVMArrayAlloc(
- shape,
- ndim,
- ctypes.c_int(dtype.type_code),
- ctypes.c_int(dtype.bits),
- ctypes.c_int(dtype.lanes),
- ctx.device_type,
- ctx.device_id,
- ctypes.byref(handle),
- )
- )
- return _make_array(handle, False, False)
+ arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope)
+ return arr
def from_dlpack(dltensor):
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index b4457bf..7fd27cb 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin
return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint);
}
+static size_t GetDataAlignment(const DLDataType dtype) {
+ size_t align = (dtype.bits / 8) * dtype.lanes;
+ if (align < kAllocAlignment) return kAllocAlignment;
+ return align;
+}
+
+void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+ Optional<String> mem_scope) {
+ if (!mem_scope.defined() || mem_scope.value() == "global") {
+ // by default, we can always redirect to the flat memory allocations
+ DLTensor temp;
+ temp.data = nullptr;
+ temp.ctx = ctx;
+ temp.ndim = ndim;
+ temp.dtype = dtype;
+ temp.shape = const_cast<int64_t*>(shape);
+ temp.strides = nullptr;
+ temp.byte_offset = 0;
+ size_t size = GetDataSize(temp);
+ size_t alignment = GetDataAlignment(temp.dtype);
+ return AllocDataSpace(ctx, size, alignment, dtype);
+ }
+ LOG(FATAL) << "Device does not support allocate data space with "
+ << "specified memory scope: " << mem_scope.value();
+ return nullptr;
+}
+
+void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
+ // by default, we can always redirect to the flat memory copy operation.
+ size_t nbytes = GetDataSize(*from);
+ ICHECK_EQ(nbytes, GetDataSize(*to));
+
+ ICHECK(IsContiguous(*from) && IsContiguous(*to))
+ << "CopyDataFromTo only support contiguous array for now";
+ CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx,
+ to->ctx, from->dtype, stream);
+}
+
+void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+ size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+ DLDataType type_hint, TVMStreamHandle stream) {
+ LOG(FATAL) << "Device does not support CopyDataFromTo.";
+}
+
void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); }
TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) {
@@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
API_END();
}
+int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+ DLDataType dtype, const char* mem_scope, void** out_data) {
+ API_BEGIN();
+ Optional<String> scope;
+ if (mem_scope != nullptr) {
+ scope = String(std::string(mem_scope));
+ }
+ out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope);
+ API_END();
+}
+
int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) {
API_BEGIN();
DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr);
API_END();
}
-int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
- size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
- DLDataType type_hint, TVMStreamHandle stream) {
+int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
API_BEGIN();
+ TVMContext ctx_from = from->ctx;
+ TVMContext ctx_to = to->ctx;
TVMContext ctx = ctx_from.device_type != kDLCPU ? ctx_from : ctx_to;
- DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, num_bytes, ctx_from,
- ctx_to, type_hint, stream);
+ DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream);
API_END();
}
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index 146bfa8..b745be3 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI {
#endif
}
- void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
- TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
- TVMStreamHandle stream) final {
- memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
- }
-
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {}
void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
@@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI {
static auto* inst = new CPUDeviceAPI();
return inst;
}
+
+ protected:
+ void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
+ TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+ TVMStreamHandle stream) final {
+ memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
+ }
};
struct CPUWorkspacePool : public WorkspacePool {
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index bc47f99..c2eb1ff 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -22,6 +22,7 @@
#include <assert.h>
#include <inttypes.h>
#include <stdarg.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -87,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa
if (alignment != 1) {
nbytes = (nbytes + alignment - 1) / alignment * alignment;
}
-
return TVMPlatformMemoryAllocate(nbytes, ctx, out_data);
}
+int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape,
+ DLDataType dtype, const char* mem_scope, void** out_data) {
+ size_t nbytes = 1;
+ for (int i = 0; i < ndim; ++i) {
+ nbytes *= shape[i];
+ }
+ nbytes *= (dtype.bits * dtype.lanes + 7) / 8;
+
+ int kAllocAlignment = 128;
+ size_t align = (dtype.bits / 8) * dtype.lanes;
+ if (align < kAllocAlignment) align = kAllocAlignment;
+ return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data);
+}
+
int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); }
-int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
- size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
- DLDataType type_hint, TVMStreamHandle stream) {
- memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes);
+static bool IsContiguous(const DLTensor* arr) {
+ if (arr->strides == NULL) return true;
+ int64_t expected_stride = 1;
+ for (int32_t i = arr->ndim; i != 0; --i) {
+ int32_t k = i - 1;
+ if (arr->strides[k] != expected_stride) return false;
+ expected_stride *= arr->shape[k];
+ }
+ return true;
+}
+
+int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
+ assert(IsContiguous(from) && IsContiguous(to));
+ size_t size = 1;
+ for (int i = 0; i < from->ndim; ++i) {
+ size *= from->shape[i];
+ }
+ size *= (from->dtype.bits * from->dtype.lanes + 7) / 8;
+ memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size);
return 0;
}
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 30abfc8..c773954 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -127,6 +127,7 @@ class CUDADeviceAPI final : public DeviceAPI {
}
}
+ protected:
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
@@ -166,6 +167,7 @@ class CUDADeviceAPI final : public DeviceAPI {
}
}
+ public:
TVMStreamHandle CreateStream(TVMContext ctx) {
CUDA_CALL(cudaSetDevice(ctx.device_id));
cudaStream_t retval;
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index 605c55e..70cebf5 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -35,9 +35,6 @@ class HexagonDeviceAPI : public DeviceAPI {
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final;
void FreeDataSpace(TVMContext ctx, void* ptr) final;
- void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
- size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
- DLDataType type_hint, TVMStreamHandle stream) final;
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final;
void FreeWorkspace(TVMContext ctx, void* ptr) final;
@@ -48,6 +45,11 @@ class HexagonDeviceAPI : public DeviceAPI {
static HexagonDeviceAPI* inst = new HexagonDeviceAPI();
return inst;
}
+
+ protected:
+ void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+ size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+ DLDataType type_hint, TVMStreamHandle stream) final;
};
// HexagonDeviceAPI.
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index d13ac7e..bd07dbf 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -84,14 +84,16 @@ class MetalWorkspace final : public DeviceAPI {
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final;
void FreeDataSpace(TVMContext ctx, void* ptr) final;
- void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size,
- TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
- TVMStreamHandle stream) final;
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;
// get the global workspace
static MetalWorkspace* Global();
+
+ protected:
+ void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size,
+ TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+ TVMStreamHandle stream) final;
};
/*! \brief Thread local workspace */
diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h
index d28e0c3..d5c61ec 100644
--- a/src/runtime/minrpc/minrpc_server.h
+++ b/src/runtime/minrpc/minrpc_server.h
@@ -169,28 +169,39 @@ class MinRPCServer {
}
void HandleCopyFromRemote() {
- uint64_t handle, offset, num_bytes;
- TVMContext ctx;
- DLDataType type_hint;
-
- this->Read(&handle);
- this->Read(&offset);
+ DLTensor* arr = this->ArenaAlloc<DLTensor>(1);
+ uint64_t data_handle;
+ this->Read(&data_handle);
+ arr->data = reinterpret_cast<void*>(data_handle);
+ this->Read(&(arr->ctx));
+ this->Read(&(arr->ndim));
+ this->Read(&(arr->dtype));
+ arr->shape = this->ArenaAlloc<int64_t>(arr->ndim);
+ this->ReadArray(arr->shape, arr->ndim);
+ arr->strides = nullptr;
+ this->Read(&(arr->byte_offset));
+
+ uint64_t num_bytes;
this->Read(&num_bytes);
- this->Read(&ctx);
- this->Read(&type_hint);
uint8_t* data_ptr;
int call_ecode = 0;
- if (ctx.device_type == kDLCPU) {
- data_ptr = reinterpret_cast<uint8_t*>(handle) + offset;
+ if (arr->ctx.device_type == kDLCPU) {
+ data_ptr = reinterpret_cast<uint8_t*>(data_handle) + arr->byte_offset;
} else {
data_ptr = this->ArenaAlloc<uint8_t>(num_bytes);
- call_ecode =
- TVMDeviceCopyDataFromTo(reinterpret_cast<void*>(handle), offset, data_ptr, 0, num_bytes,
- ctx, DLContext{kDLCPU, 0}, type_hint, nullptr);
+ DLTensor temp;
+ temp.data = reinterpret_cast<void*>(data_ptr);
+ temp.ctx = arr->ctx;
+ temp.ndim = arr->ndim;
+ temp.dtype = arr->dtype;
+ temp.shape = arr->shape;
+ temp.strides = nullptr;
+ temp.byte_offset = 0;
+ call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr);
// need sync to make sure that the copy is completed.
if (call_ecode == 0) {
- call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+ call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr);
}
}
@@ -209,30 +220,39 @@ class MinRPCServer {
}
void HandleCopyToRemote() {
- uint64_t handle, offset, num_bytes;
- TVMContext ctx;
- DLDataType type_hint;
-
- this->Read(&handle);
- this->Read(&offset);
+ DLTensor* arr = this->ArenaAlloc<DLTensor>(1);
+ uint64_t data_handle;
+ this->Read(&data_handle);
+ arr->data = reinterpret_cast<void*>(data_handle);
+ this->Read(&(arr->ctx));
+ this->Read(&(arr->ndim));
+ this->Read(&(arr->dtype));
+ arr->shape = this->ArenaAlloc<int64_t>(arr->ndim);
+ this->ReadArray(arr->shape, arr->ndim);
+ arr->strides = nullptr;
+ this->Read(&(arr->byte_offset));
+ uint64_t num_bytes;
this->Read(&num_bytes);
- this->Read(&ctx);
- this->Read(&type_hint);
- int call_ecode = 0;
- if (ctx.device_type == kDLCPU) {
- uint8_t* dptr = reinterpret_cast<uint8_t*>(handle) + offset;
+ int call_ecode = 0;
+ if (arr->ctx.device_type == kDLCPU) {
+ uint8_t* dptr = reinterpret_cast<uint8_t*>(data_handle) + arr->byte_offset;
this->ReadArray(dptr, num_bytes);
} else {
uint8_t* temp_data = this->ArenaAlloc<uint8_t>(num_bytes);
this->ReadArray(temp_data, num_bytes);
-
- call_ecode =
- TVMDeviceCopyDataFromTo(temp_data, 0, reinterpret_cast<void*>(handle), offset, num_bytes,
- DLContext{kDLCPU, 0}, ctx, type_hint, nullptr);
+ DLTensor temp;
+ temp.data = temp_data;
+ temp.ctx = DLContext{kDLCPU, 0};
+ temp.ndim = arr->ndim;
+ temp.dtype = arr->dtype;
+ temp.shape = arr->shape;
+ temp.strides = nullptr;
+ temp.byte_offset = 0;
+ call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr);
// need sync to make sure that the copy is completed.
if (call_ecode == 0) {
- call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+ call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr);
}
}
@@ -269,6 +289,10 @@ class MinRPCServer {
this->SyscallDevAllocData(values, tcodes, num_args);
break;
}
+ case RPCCode::kDevAllocDataWithScope: {
+ this->SyscallDevAllocDataWithScope(values, tcodes, num_args);
+ break;
+ }
case RPCCode::kDevFreeData: {
this->SyscallDevFreeData(values, tcodes, num_args);
break;
@@ -342,34 +366,20 @@ class MinRPCServer {
}
void SyscallCopyAmongRemote(TVMValue* values, int* tcodes, int num_args) {
- MINRPC_CHECK(num_args == 9);
- // from, from_offset
- MINRPC_CHECK(tcodes[0] == kTVMOpaqueHandle);
- MINRPC_CHECK(tcodes[1] == kDLInt);
- // to, to_offset
+ MINRPC_CHECK(num_args == 3);
+ // from dltensor
+ MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle);
+ // to dltensor
+ MINRPC_CHECK(tcodes[1] == kTVMDLTensorHandle);
+ // stream
MINRPC_CHECK(tcodes[2] == kTVMOpaqueHandle);
- MINRPC_CHECK(tcodes[3] == kDLInt);
- // size
- MINRPC_CHECK(tcodes[4] == kDLInt);
- // ctx_from, ctx_to
- MINRPC_CHECK(tcodes[5] == kTVMContext);
- MINRPC_CHECK(tcodes[6] == kTVMContext);
- // type_hint, stream
- MINRPC_CHECK(tcodes[7] == kTVMDataType);
- MINRPC_CHECK(tcodes[8] == kTVMOpaqueHandle);
void* from = values[0].v_handle;
- int64_t from_offset = values[1].v_int64;
- void* to = values[2].v_handle;
- int64_t to_offset = values[3].v_int64;
- int64_t size = values[4].v_int64;
- TVMContext ctx_from = values[5].v_ctx;
- TVMContext ctx_to = values[6].v_ctx;
- DLDataType type_hint = values[7].v_type;
- TVMStreamHandle stream = values[8].v_handle;
-
- int call_ecode = TVMDeviceCopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from,
- ctx_to, type_hint, stream);
+ void* to = values[1].v_handle;
+ TVMStreamHandle stream = values[2].v_handle;
+
+ int call_ecode = TVMDeviceCopyDataFromTo(reinterpret_cast<DLTensor*>(from),
+ reinterpret_cast<DLTensor*>(to), stream);
if (call_ecode == 0) {
this->ReturnVoid();
@@ -400,6 +410,23 @@ class MinRPCServer {
}
}
+ void SyscallDevAllocDataWithScope(TVMValue* values, int* tcodes, int num_args) {
+ MINRPC_CHECK(num_args == 2);
+ MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle);
+ MINRPC_CHECK(tcodes[1] == kTVMNullptr || tcodes[1] == kTVMStr);
+
+ DLTensor* arr = reinterpret_cast<DLTensor*>(values[0].v_handle);
+ const char* mem_scope = (tcodes[1] == kTVMNullptr ? nullptr : values[1].v_str);
+ void* handle;
+ int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype,
+ mem_scope, &handle);
+ if (call_ecode == 0) {
+ this->ReturnHandle(handle);
+ } else {
+ this->ReturnLastTVMError();
+ }
+ }
+
void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) {
MINRPC_CHECK(num_args == 2);
MINRPC_CHECK(tcodes[0] == kTVMContext);
diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h
index e195b9c..07d13a7 100644
--- a/src/runtime/minrpc/rpc_reference.h
+++ b/src/runtime/minrpc/rpc_reference.h
@@ -28,7 +28,7 @@ namespace tvm {
namespace runtime {
/*! \brief The current RPC procotol version. */
-constexpr const char* kRPCProtocolVer = "0.7.0";
+constexpr const char* kRPCProtocolVer = "0.8.0";
/*! \brief The RPC code */
enum class RPCCode : int {
@@ -51,6 +51,7 @@ enum class RPCCode : int {
kDevFreeData,
kDevStreamSync,
kCopyAmongRemote,
+ kDevAllocDataWithScope,
};
/*!
@@ -107,6 +108,8 @@ inline const char* RPCCodeToString(RPCCode code) {
return "kDevStreamSync";
case RPCCode::kCopyAmongRemote:
return "kCopyAmongRemote";
+ case RPCCode::kDevAllocDataWithScope:
+ return "kDevAllocDataWithScope";
default:
return "";
}
@@ -218,6 +221,44 @@ struct RPCReference {
return getter.num_bytes();
}
+ template <typename TChannelPtr>
+ static void SendDLTensor(TChannelPtr channel, DLTensor* arr) {
+ TVMContext ctx;
+ uint64_t data;
+ // When we return NDArray, we directly return
+ // the space and the context
+ // The client will be further wrapping
+ ctx = arr->ctx;
+ data = reinterpret_cast<uint64_t>(arr->data);
+ channel->Write(data);
+ channel->Write(ctx);
+ channel->Write(arr->ndim);
+ channel->Write(arr->dtype);
+ channel->WriteArray(arr->shape, arr->ndim);
+ if (arr->strides != nullptr) {
+ channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride);
+ }
+ channel->Write(arr->byte_offset);
+ return;
+ }
+
+ template <typename TChannelPtr>
+ static DLTensor* ReceiveDLTensor(TChannelPtr channel) {
+ uint64_t handle;
+ channel->Read(&handle);
+ DLTensor* arr = channel->template ArenaAlloc<DLTensor>(1);
+ DLTensor& tensor = *arr;
+ tensor.data = reinterpret_cast<void*>(handle);
+ channel->Read(&(tensor.ctx));
+ channel->Read(&(tensor.ndim));
+ channel->Read(&(tensor.dtype));
+ tensor.shape = channel->template ArenaAlloc<int64_t>(tensor.ndim);
+ channel->ReadArray(tensor.shape, tensor.ndim);
+ tensor.strides = nullptr;
+ channel->Read(&(tensor.byte_offset));
+ return arr;
+ }
+
/*!
* \brief Send packed argument sequnce to the other peer.
*
@@ -292,24 +333,7 @@ struct RPCReference {
}
case kTVMDLTensorHandle: {
DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
- TVMContext ctx;
- uint64_t data;
- // When we return NDArray, we directly return
- // the space and the context
- // The client will be further wrapping
- ctx = arr->ctx;
- data = reinterpret_cast<uint64_t>(arr->data);
- channel->Write(data);
- channel->Write(ctx);
- channel->Write(arr->ndim);
- channel->Write(arr->dtype);
- channel->WriteArray(arr->shape, arr->ndim);
- if (arr->strides != nullptr) {
- channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride);
- }
- if (arr->byte_offset != 0) {
- channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldByteOffset);
- }
+ SendDLTensor(channel, arr);
break;
}
case kTVMNullptr:
@@ -422,19 +446,7 @@ struct RPCReference {
break;
}
case kTVMDLTensorHandle: {
- uint64_t handle;
- channel->Read(&handle);
- DLTensor* arr = channel->template ArenaAlloc<DLTensor>(1);
- DLTensor& tensor = *arr;
- tensor.data = reinterpret_cast<void*>(handle);
- channel->Read(&(tensor.ctx));
- channel->Read(&(tensor.ndim));
- channel->Read(&(tensor.dtype));
- tensor.shape = channel->template ArenaAlloc<int64_t>(tensor.ndim);
- channel->ReadArray(tensor.shape, tensor.ndim);
- tensor.strides = nullptr;
- tensor.byte_offset = 0;
- value.v_handle = arr;
+ value.v_handle = ReceiveDLTensor(channel);
break;
}
default: {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index dae7756..d3ddbf8 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -24,6 +24,7 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
#include <tvm/support/logging.h>
#include "runtime_base.h"
@@ -58,36 +59,39 @@ inline void VerifyDataType(DLDataType dtype) {
ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
}
-inline size_t GetDataAlignment(const DLTensor& arr) {
- size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes;
- if (align < kAllocAlignment) return kAllocAlignment;
- return align;
-}
-
void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) {
- TVMContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
size_t arr_size = GetDataSize(*handle);
ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch";
ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now";
- DeviceAPI::Get(handle->ctx)
- ->CopyDataFromTo(data, 0, handle->data, static_cast<size_t>(handle->byte_offset), nbytes,
- cpu_ctx, handle->ctx, handle->dtype, nullptr);
+
+ DLTensor from;
+ from.data = const_cast<void*>(data);
+ from.ctx = DLContext{kDLCPU, 0};
+ from.ndim = handle->ndim;
+ from.dtype = handle->dtype;
+ from.shape = handle->shape;
+ from.strides = nullptr;
+ from.byte_offset = 0;
+ DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr);
// Synchronize in case data become unavailable later.
DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr);
}
void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) {
- TVMContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
size_t arr_size = GetDataSize(*handle);
ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch";
ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now";
- DeviceAPI::Get(handle->ctx)
- ->CopyDataFromTo(handle->data, static_cast<size_t>(handle->byte_offset), data, 0, nbytes,
- handle->ctx, cpu_ctx, handle->dtype, nullptr);
+
+ DLTensor to;
+ to.data = const_cast<void*>(data);
+ to.ctx = DLContext{kDLCPU, 0};
+ to.ndim = handle->ndim;
+ to.dtype = handle->dtype;
+ to.shape = handle->shape;
+ to.strides = nullptr;
+ to.byte_offset = 0;
+
+ DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast<DLTensor*>(handle), &to, nullptr);
// Synchronize in case data become unavailable later.
DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr);
}
@@ -186,13 +190,11 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape, DLDataType dtype) {
DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); }
-NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) {
+NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx,
+ Optional<String> mem_scope) {
NDArray ret = Internal::Create(shape, dtype, ctx);
- // setup memory content
- size_t size = GetDataSize(ret.get_mutable()->dl_tensor);
- size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor);
- ret.get_mutable()->dl_tensor.data =
- DeviceAPI::Get(ret->ctx)->AllocDataSpace(ret->ctx, size, alignment, ret->dtype);
+ ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace(
+ ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope);
return ret;
}
@@ -236,9 +238,7 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str
// api manager.
TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx;
- DeviceAPI::Get(ctx)->CopyDataFromTo(from->data, static_cast<size_t>(from->byte_offset), to->data,
- static_cast<size_t>(to->byte_offset), from_size, from->ctx,
- to->ctx, from->dtype, stream);
+ DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast<DLTensor*>(from), to, stream);
}
std::vector<int64_t> NDArray::Shape() const { return get_mutable()->shape_; }
@@ -279,6 +279,17 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_
API_END();
}
+TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, TVMRetValue* ret) {
+ int64_t* shape_ptr = static_cast<int64_t*>(static_cast<void*>(args[0]));
+ int ndim = args[1];
+ std::vector<int64_t> shape(shape_ptr, shape_ptr + ndim);
+ DataType dtype = args[2];
+ TVMContext ctx = args[3];
+ Optional<String> mem_scope = args[4];
+ auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope);
+ *ret = ndarray;
+});
+
int TVMArrayFree(TVMArrayHandle handle) {
API_BEGIN();
NDArray::Internal::FFIDecRef(handle);
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index fa118ed..2e7f05f 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -232,9 +232,6 @@ class OpenCLWorkspace : public DeviceAPI {
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final;
void FreeDataSpace(TVMContext ctx, void* ptr) final;
- void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
- TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
- TVMStreamHandle stream) final;
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
void FreeWorkspace(TVMContext ctx, void* data) final;
@@ -246,6 +243,11 @@ class OpenCLWorkspace : public DeviceAPI {
// get the global workspace
static OpenCLWorkspace* Global();
+
+ protected:
+ void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
+ TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
+ TVMStreamHandle stream) final;
};
/*! \brief Thread local workspace */
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index a1e96e9..06737f9 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -43,6 +43,18 @@ class RPCDeviceAPI final : public DeviceAPI {
GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv);
}
+ void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+ Optional<String> mem_scope) final {
+ auto sess = GetSess(ctx);
+ auto remote_ctx = RemoveRPCSessionMask(ctx);
+ void* data =
+ sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope);
+ RemoteSpace* space = new RemoteSpace();
+ space->data = data;
+ space->sess = std::move(sess);
+ return space;
+ }
+
void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
DLDataType type_hint) final {
auto sess = GetSess(ctx);
@@ -65,30 +77,36 @@ class RPCDeviceAPI final : public DeviceAPI {
}
delete space;
}
- void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
- TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
- TVMStreamHandle stream) final {
+
+ void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
+ DLContext ctx_from = from->ctx;
+ DLContext ctx_to = to->ctx;
if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) {
ICHECK(ctx_from.device_type == ctx_to.device_type)
<< "Cannot copy across two different remote session";
- auto remote_ctx_from = RemoveRPCSessionMask(ctx_from);
- auto remote_ctx_to = RemoveRPCSessionMask(ctx_to);
- auto remote_ctx = remote_ctx_from;
- if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to;
- GetSess(ctx_from)
- ->GetDeviceAPI(remote_ctx)
- ->CopyDataFromTo(static_cast<const RemoteSpace*>(from)->data, from_offset,
- static_cast<const RemoteSpace*>(to)->data, to_offset, size,
- remote_ctx_from, remote_ctx_to, type_hint, stream);
+ DLTensor from_tensor = *from;
+ from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+ from_tensor.data = static_cast<const RemoteSpace*>(from->data)->data;
+ DLTensor to_tensor = *to;
+ to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+ to_tensor.data = static_cast<const RemoteSpace*>(to->data)->data;
+ auto remote_ctx = from_tensor.ctx;
+ if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx;
+ GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream);
} else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) {
- auto remote_ctx_from = RemoveRPCSessionMask(ctx_from);
- GetSess(ctx_from)->CopyFromRemote(static_cast<const RemoteSpace*>(from)->data, from_offset,
- to, to_offset, size, remote_ctx_from, type_hint);
+ DLTensor from_tensor = *from;
+ from_tensor.ctx = RemoveRPCSessionMask(ctx_from);
+ from_tensor.data = static_cast<const RemoteSpace*>(from->data)->data;
+ void* to_bytes = static_cast<char*>(to->data) + to->byte_offset;
+ size_t nbytes = GetDataSize(*to);
+ GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes);
} else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) {
- auto remote_ctx_to = RemoveRPCSessionMask(ctx_to);
- GetSess(ctx_to)->CopyToRemote(const_cast<void*>(from), from_offset,
- static_cast<const RemoteSpace*>(to)->data, to_offset, size,
- remote_ctx_to, type_hint);
+ DLTensor to_tensor = *to;
+ to_tensor.ctx = RemoveRPCSessionMask(ctx_to);
+ to_tensor.data = static_cast<const RemoteSpace*>(to->data)->data;
+ void* from_bytes = static_cast<char*>(from->data) + from->byte_offset;
+ size_t nbytes = GetDataSize(*from);
+ GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes);
} else {
LOG(FATAL) << "expect copy from/to remote or between remote";
}
@@ -99,6 +117,13 @@ class RPCDeviceAPI final : public DeviceAPI {
GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream);
}
+ protected:
+ void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset,
+ size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to,
+ DLDataType type_hint, TVMStreamHandle stream) final {
+ LOG(FATAL) << "Not implemented.";
+ }
+
private:
std::shared_ptr<RPCSession> GetSess(TVMContext ctx) {
int tbl_index = GetRPCSessionIndex(ctx);
diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc
index fbdd93f..8716355 100644
--- a/src/runtime/rpc/rpc_endpoint.cc
+++ b/src/runtime/rpc/rpc_endpoint.cc
@@ -387,88 +387,72 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
void HandleSyscall(RPCCode code);
void HandleCopyFromRemote() {
- uint64_t handle, offset, num_bytes;
- TVMContext ctx;
- DLDataType type_hint;
- this->Read(&handle);
- this->Read(&offset);
- this->Read(&num_bytes);
- this->Read(&ctx);
- this->Read(&type_hint);
- size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
-
+ DLTensor* arr = RPCReference::ReceiveDLTensor(this);
+ uint64_t data_bytes;
+ this->Read(&data_bytes);
+ size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
auto* sess = GetServingSession();
-
// Return Copy Ack with the given data
- auto fcopyack = [this](char* data_ptr, size_t num_bytes) {
+ auto fcopyack = [this](char* dptr, size_t num_bytes) {
RPCCode code = RPCCode::kCopyAck;
uint64_t packet_nbytes = sizeof(code) + num_bytes;
this->Write(packet_nbytes);
this->Write(code);
- this->WriteArray(data_ptr, num_bytes);
+ this->WriteArray(dptr, num_bytes);
this->SwitchToState(kRecvPacketNumBytes);
};
// When session is local, we can directly treat handle
// as the cpu pointer without allocating a temp space.
- if (ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) {
- char* data_ptr = reinterpret_cast<char*>(handle) + offset;
- fcopyack(data_ptr, num_bytes);
+ if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) {
+ char* data_ptr = reinterpret_cast<char*>(arr->data) + arr->byte_offset;
+ fcopyack(data_ptr, data_bytes);
} else {
- char* data_ptr = this->ArenaAlloc<char>(num_bytes);
-
- auto on_copy_complete = [this, elem_bytes, num_bytes, data_ptr, fcopyack](RPCCode status,
- TVMArgs args) {
+ char* temp_data = this->ArenaAlloc<char>(data_bytes);
+ auto on_copy_complete = [this, elem_bytes, data_bytes, temp_data, fcopyack](RPCCode status,
+ TVMArgs args) {
if (status == RPCCode::kException) {
this->ReturnException(args.values[0].v_str);
this->SwitchToState(kRecvPacketNumBytes);
} else {
// endian aware handling
if (!DMLC_IO_NO_ENDIAN_SWAP) {
- dmlc::ByteSwap(data_ptr, elem_bytes, num_bytes / elem_bytes);
+ dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes);
}
- fcopyack(data_ptr, num_bytes);
+ fcopyack(temp_data, data_bytes);
}
};
this->SwitchToState(kWaitForAsyncCallback);
- sess->AsyncCopyFromRemote(reinterpret_cast<void*>(handle), offset, data_ptr, 0, num_bytes,
- ctx, type_hint, on_copy_complete);
+ sess->AsyncCopyFromRemote(arr, static_cast<void*>(temp_data), data_bytes, on_copy_complete);
}
}
void HandleCopyToRemote() {
- uint64_t handle, offset, num_bytes;
- TVMContext ctx;
- DLDataType type_hint;
-
- this->Read(&handle);
- this->Read(&offset);
- this->Read(&num_bytes);
- this->Read(&ctx);
- this->Read(&type_hint);
-
- size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
+ DLTensor* arr = RPCReference::ReceiveDLTensor(this);
+ uint64_t data_bytes;
+ this->Read(&data_bytes);
+ size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8;
auto* sess = GetServingSession();
// When session is local, we can directly treat handle
// as the cpu pointer without allocating a temp space.
- if (ctx.device_type == kDLCPU && sess->IsLocalSession()) {
- char* dptr = reinterpret_cast<char*>(handle) + offset;
- this->ReadArray(dptr, num_bytes);
+ if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) {
+ char* dptr = reinterpret_cast<char*>(arr->data) + arr->byte_offset;
+ this->ReadArray(dptr, data_bytes);
if (!DMLC_IO_NO_ENDIAN_SWAP) {
- dmlc::ByteSwap(dptr, elem_bytes, num_bytes / elem_bytes);
+ dmlc::ByteSwap(dptr, elem_bytes, data_bytes / elem_bytes);
}
this->ReturnVoid();
this->SwitchToState(kRecvPacketNumBytes);
} else {
- char* temp_data = this->ArenaAlloc<char>(num_bytes);
- this->ReadArray(temp_data, num_bytes);
+ char* temp_data = this->ArenaAlloc<char>(data_bytes);
+ this->ReadArray(temp_data, data_bytes);
if (!DMLC_IO_NO_ENDIAN_SWAP) {
- dmlc::ByteSwap(temp_data, elem_bytes, num_bytes / elem_bytes);
+ dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes);
}
auto on_copy_complete = [this](RPCCode status, TVMArgs args) {
@@ -482,8 +466,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream {
};
this->SwitchToState(kWaitForAsyncCallback);
- sess->AsyncCopyToRemote(temp_data, 0, reinterpret_cast<void*>(handle), offset, num_bytes, ctx,
- type_hint, on_copy_complete);
+ sess->AsyncCopyToRemote(static_cast<void*>(temp_data), arr, data_bytes, on_copy_complete);
}
}
@@ -815,51 +798,47 @@ void RPCEndpoint::CallFunc(RPCSession::PackedFuncHandle h, const TVMValue* arg_v
ICHECK(code == RPCCode::kReturn) << "code=" << static_cast<int>(code);
}
-void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset,
- size_t data_size, TVMContext ctx_to, DLDataType type_hint) {
+void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) {
std::lock_guard<std::mutex> lock(mutex_);
RPCCode code = RPCCode::kCopyToRemote;
- uint64_t handle = reinterpret_cast<uint64_t>(to);
- uint64_t offset = static_cast<uint64_t>(to_offset);
- uint64_t size = static_cast<uint64_t>(data_size);
- uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) +
- sizeof(ctx_to) + sizeof(type_hint) + data_size;
+ uint64_t num_data_bytes = static_cast<uint64_t>(GetDataSize(*to));
+ ICHECK_EQ(nbytes, num_data_bytes);
+
+ uint64_t to_data = reinterpret_cast<uint64_t>(to->data);
+ uint64_t shape_bytes = to->ndim * sizeof(int64_t);
+ uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) +
+ sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes +
+ sizeof(nbytes) + num_data_bytes;
handler_->Write(packet_nbytes);
handler_->Write(code);
- handler_->Write(handle);
- handler_->Write(offset);
- handler_->Write(size);
- handler_->Write(ctx_to);
- handler_->Write(type_hint);
- handler_->WriteArray(reinterpret_cast<char*>(from) + from_offset, data_size);
-
+ RPCReference::SendDLTensor(handler_, to);
+ handler_->Write(nbytes);
+ handler_->WriteArray(reinterpret_cast<char*>(from_bytes), nbytes);
ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn);
}
-void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset,
- size_t data_size, TVMContext ctx_from, DLDataType type_hint) {
+void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
std::lock_guard<std::mutex> lock(mutex_);
RPCCode code = RPCCode::kCopyFromRemote;
- uint64_t handle = reinterpret_cast<uint64_t>(from);
- uint64_t offset = static_cast<uint64_t>(from_offset);
- uint64_t size = static_cast<uint64_t>(data_size);
- uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) +
- sizeof(ctx_from) + sizeof(type_hint);
+ uint64_t num_data_bytes = static_cast<uint64_t>(GetDataSize(*from));
+ CHECK_EQ(nbytes, num_data_bytes);
+
+ uint64_t from_data = reinterpret_cast<uint64_t>(from->data);
+ uint64_t shape_bytes = from->ndim * sizeof(int64_t);
+ uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) +
+ sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) +
+ shape_bytes + sizeof(nbytes);
handler_->Write(packet_nbytes);
handler_->Write(code);
- handler_->Write(handle);
- handler_->Write(offset);
- handler_->Write(size);
- handler_->Write(ctx_from);
- handler_->Write(type_hint);
-
- TVMRetValue rv;
+ RPCReference::SendDLTensor(handler_, from);
+ handler_->Write(nbytes);
ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck);
- handler_->ReadArray(reinterpret_cast<char*>(to) + to_offset, data_size);
+
+ handler_->ReadArray(reinterpret_cast<char*>(to_bytes), nbytes);
handler_->FinishCopyAck();
}
@@ -904,6 +883,23 @@ void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
*rv = data;
}
+void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
+ DLTensor* arr = args[0];
+ TVMContext ctx = arr->ctx;
+ int ndim = arr->ndim;
+ int64_t* shape = arr->shape;
+ DLDataType dtype = arr->dtype;
+ int tcode = args[1].type_code();
+ Optional<String> mem_scope = NullOpt;
+ if (tcode == kTVMStr) {
+ mem_scope = args[1].operator String();
+ } else {
+ ICHECK_EQ(tcode, kTVMNullptr);
+ }
+ void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope);
+ *rv = data;
+}
+
void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
TVMContext ctx = args[0];
void* ptr = args[1];
@@ -911,25 +907,18 @@ void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
}
void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) {
- void* from = args[0];
- uint64_t from_offset = args[1];
- void* to = args[2];
- uint64_t to_offset = args[3];
- uint64_t size = args[4];
- TVMContext ctx_from = args[5];
- TVMContext ctx_to = args[6];
- DLDataType type_hint = args[7];
- TVMStreamHandle stream = args[8];
- TVMContext ctx = ctx_from;
+ DLTensor* from = args[0];
+ DLTensor* to = args[1];
+ TVMStreamHandle stream = args[2];
+ TVMContext ctx = from->ctx;
if (ctx.device_type == kDLCPU) {
- ctx = ctx_to;
+ ctx = to->ctx;
} else {
- ICHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type)
+ ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type)
<< "Can not copy across different ctx types directly";
}
- handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from,
- ctx_to, type_hint, stream);
+ handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream);
}
void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) {
@@ -951,6 +940,9 @@ void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) {
case RPCCode::kDevAllocData:
SysCallHandler(RPCDevAllocData);
break;
+ case RPCCode::kDevAllocDataWithScope:
+ SysCallHandler(RPCDevAllocDataWithScope);
+ break;
case RPCCode::kDevFreeData:
SysCallHandler(RPCDevFreeData);
break;
@@ -989,14 +981,12 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
endpoint_->CallFunc(func, arg_values, arg_type_codes, num_args, fencode_return);
}
- void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
- TVMContext ctx_to, DLDataType type_hint) final {
- endpoint_->CopyToRemote(from, from_offset, to, to_offset, nbytes, ctx_to, type_hint);
+ void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) final {
+ endpoint_->CopyToRemote(local_from_bytes, remote_to, nbytes);
}
- void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
- TVMContext ctx_from, DLDataType type_hint) final {
- endpoint_->CopyFromRemote(from, from_offset, to, to_offset, nbytes, ctx_from, type_hint);
+ void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) final {
+ endpoint_->CopyFromRemote(remote_from, local_to_bytes, nbytes);
}
void FreeHandle(void* handle, int type_code) final {
@@ -1019,15 +1009,30 @@ class RPCClientSession : public RPCSession, public DeviceAPI {
return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint);
}
+ void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+ Optional<String> mem_scope) final {
+ DLTensor temp;
+ temp.data = nullptr;
+ temp.ctx = ctx;
+ temp.ndim = ndim;
+ temp.dtype = dtype;
+ temp.shape = const_cast<int64_t*>(shape);
+ temp.strides = nullptr;
+ temp.byte_offset = 0;
+ if (mem_scope.defined()) {
+ return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp,
+ static_cast<std::string>(mem_scope.value()));
+ } else {
+ return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, nullptr);
+ }
+ }
+
void FreeDataSpace(TVMContext ctx, void* ptr) final {
endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr);
}
- void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
- TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
- TVMStreamHandle stream) final {
- endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, const_cast<void*>(from), from_offset, to,
- to_offset, size, ctx_from, ctx_to, type_hint, stream);
+ void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final {
+ endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream);
}
void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h
index 031435f..8e08bfa 100644
--- a/src/runtime/rpc/rpc_endpoint.h
+++ b/src/runtime/rpc/rpc_endpoint.h
@@ -135,8 +135,7 @@ class RPCEndpoint {
* \param ctx_to The target context.
* \param type_hint Hint of content data type.
*/
- void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
- TVMContext ctx_to, DLDataType type_hint);
+ void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes);
/*!
* \brief Copy bytes from remote array content.
* \param from The source host data.
@@ -147,8 +146,7 @@ class RPCEndpoint {
* \param ctx_from The source context.
* \param type_hint Hint of content data type.
*/
- void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
- TVMContext ctx_from, DLDataType type_hint);
+ void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes);
/*!
* \brief Call a remote defined system function with arguments.
diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc
index b35c62d..0650b55d 100644
--- a/src/runtime/rpc/rpc_local_session.cc
+++ b/src/runtime/rpc/rpc_local_session.cc
@@ -87,26 +87,36 @@ void LocalSession::CallFunc(RPCSession::PackedFuncHandle func, const TVMValue* a
this->EncodeReturn(std::move(rv), encode_return);
}
-void LocalSession::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset,
- size_t nbytes, TVMContext ctx_to, DLDataType type_hint) {
- TVMContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
- this->GetDeviceAPI(ctx_to)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, cpu_ctx,
- ctx_to, type_hint, nullptr);
+void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) {
+ ICHECK_EQ(nbytes, GetDataSize(*to));
+ DLTensor from;
+ from.data = from_bytes;
+ from.ctx = {kDLCPU, 0};
+ from.ndim = to->ndim;
+ from.shape = to->shape;
+ from.dtype = to->dtype;
+ from.strides = nullptr;
+ from.byte_offset = 0;
+ TVMContext ctx_to = to->ctx;
+ this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr);
// Copy can happen asynchrously
// synchronize to make sure that copy is completed
this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr);
}
-void LocalSession::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset,
- size_t nbytes, TVMContext ctx_from, DLDataType type_hint) {
- TVMContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
-
- this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, ctx_from,
- cpu_ctx, type_hint, nullptr);
+void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) {
+ ICHECK_EQ(nbytes, GetDataSize(*from));
+ DLTensor to;
+ to.data = to_bytes;
+ to.ctx = {kDLCPU, 0};
+ to.ndim = from->ndim;
+ to.shape = from->shape;
+ to.dtype = from->dtype;
+ to.strides = nullptr;
+ to.byte_offset = 0;
+
+ TVMContext ctx_from = from->ctx;
+ this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr);
// Copy can happen asynchrously
// synchronize to make sure that copy is completed
this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr);
diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h
index 7a67ce8..ea070e3 100644
--- a/src/runtime/rpc/rpc_local_session.h
+++ b/src/runtime/rpc/rpc_local_session.h
@@ -48,11 +48,9 @@ class LocalSession : public RPCSession {
void CallFunc(PackedFuncHandle func, const TVMValue* arg_values, const int* arg_type_codes,
int num_args, const FEncodeReturn& fencode_return) override;
- void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
- TVMContext ctx_to, DLDataType type_hint) override;
+ void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) override;
- void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes,
- TVMContext ctx_from, DLDataType type_hint) override;
+ void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) override;
void FreeHandle(void* handle, int type_code) override;
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index f5405f0..0ac5b8d 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -51,33 +51,28 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values
}
}
-void RPCSession::AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
- size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
- DLDataType type_hint, RPCSession::FAsyncCallback callback) {
+void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+ RPCSession::FAsyncCallback callback) {
TVMValue value;
int32_t tcode = kTVMNullptr;
value.v_handle = nullptr;
try {
- this->CopyToRemote(local_from, local_from_offset, remote_to, remote_to_offset, nbytes,
- remote_ctx_to, type_hint);
+ this->CopyToRemote(local_from_bytes, remote_to, nbytes);
callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
} catch (const std::runtime_error& e) {
this->SendException(callback, e.what());
}
}
-void RPCSession::AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
- size_t local_to_offset, size_t nbytes,
- TVMContext remote_ctx_from, DLDataType type_hint,
+void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
RPCSession::FAsyncCallback callback) {
TVMValue value;
int32_t tcode = kTVMNullptr;
value.v_handle = nullptr;
try {
- this->CopyFromRemote(remote_from, remote_from_offset, local_to, local_to_offset, nbytes,
- remote_ctx_from, type_hint);
+ this->CopyFromRemote(remote_from, local_to_bytes, nbytes);
callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1));
} catch (const std::runtime_error& e) {
this->SendException(callback, e.what());
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 4ea937a..4b942f2 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -127,30 +127,18 @@ class RPCSession {
/*!
* \brief Copy bytes into remote array content.
- * \param local_from The source host data.
- * \param local_from_offset The byte offeset in the from.
+ * \param local_from_bytes The source host data.
* \param remote_to The target array.
- * \param remote_to_offset The byte offset in the to.
* \param nbytes The size of the memory in bytes.
- * \param remote_ctx_to The target context.
- * \param type_hint Hint of content data type.
*/
- virtual void CopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
- size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
- DLDataType type_hint) = 0;
+ virtual void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) = 0;
/*!
* \brief Copy bytes from remote array content.
* \param remote_from The source host data.
- * \param remote_from_offset The byte offeset in the from.
- * \param to The target array.
- * \param to_offset The byte offset in the to.
+ * \param local_to_bytes The target array.
* \param nbytes The size of the memory in bytes.
- * \param remote_ctx_from The source context in the remote.
- * \param type_hint Hint of content data type.
*/
- virtual void CopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
- size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from,
- DLDataType type_hint) = 0;
+ virtual void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) = 0;
/*!
* \brief Free a remote function.
@@ -223,40 +211,27 @@ class RPCSession {
/*!
* \brief Asynchrous version of CopyToRemote.
*
- * \param local_from The source host data.
- * \param local_from_offset The byte offeset in the from.
+ * \param local_from_bytes The source host data.
* \param remote_to The target array.
- * \param remote_to_offset The byte offset in the to.
* \param nbytes The size of the memory in bytes.
- * \param remote_ctx_to The target context.
- * \param type_hint Hint of content data type.
- *
* \param on_complete The callback to signal copy complete.
* \note All the allocated memory in local_from, and remote_to
* must stay alive until on_compelete is called.
*/
- virtual void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
- size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
- DLDataType type_hint, FAsyncCallback on_complete);
+ virtual void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+ FAsyncCallback on_complete);
/*!
* \brief Asynchrous version of CopyFromRemote.
*
* \param remote_from The source host data.
- * \param remote_from_offset The byte offeset in the from.
- * \param to The target array.
- * \param to_offset The byte offset in the to.
+ * \param local_to_bytes The target array.
* \param nbytes The size of the memory in bytes.
- * \param remote_ctx_from The source context in the remote.
- * \param type_hint Hint of content data type.
- *
* \param on_complete The callback to signal copy complete.
* \note All the allocated memory in remote_from, and local_to
* must stay alive until on_compelete is called.
*/
- virtual void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
- size_t local_to_offset, size_t nbytes,
- TVMContext remote_ctx_from, DLDataType type_hint,
+ virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
FAsyncCallback on_complete);
/*!
* \brief Asynchrously wait for all events in ctx, stream compeletes.
diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc
index cbf1974..f40fd80 100644
--- a/src/runtime/vulkan/vulkan.cc
+++ b/src/runtime/vulkan/vulkan.cc
@@ -199,6 +199,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
delete pbuf;
}
+ protected:
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
@@ -307,6 +308,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
}
}
+ public:
// Always use the default stream
TVMStreamHandle CreateStream(TVMContext ctx) {
LOG(FATAL) << "Not implemented";
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index 6abd122..b72caad 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -177,33 +177,37 @@ class AsyncLocalSession : public LocalSession {
}
}
- void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to,
- size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to,
- DLDataType type_hint, FAsyncCallback on_complete) final {
- TVMContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
+ void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes,
+ FAsyncCallback on_complete) final {
try {
- this->GetDeviceAPI(remote_ctx_to)
- ->CopyDataFromTo(local_from, local_from_offset, remote_to, remote_to_offset, nbytes,
- cpu_ctx, remote_ctx_to, type_hint, nullptr);
- this->AsyncStreamWait(remote_ctx_to, nullptr, on_complete);
+ DLTensor local_from;
+ local_from.data = local_from_bytes;
+ local_from.ctx = TVMContext{kDLCPU, 0};
+ local_from.ndim = remote_to->ndim;
+ local_from.shape = remote_to->shape;
+ local_from.dtype = remote_to->dtype;
+ local_from.strides = nullptr;
+ local_from.byte_offset = 0;
+ this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr);
+ this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete);
} catch (const std::runtime_error& e) {
this->SendException(on_complete, e.what());
}
}
- void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to,
- size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from,
- DLDataType type_hint, FAsyncCallback on_complete) final {
- TVMContext cpu_ctx;
- cpu_ctx.device_type = kDLCPU;
- cpu_ctx.device_id = 0;
+ void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes,
+ FAsyncCallback on_complete) final {
try {
- this->GetDeviceAPI(remote_ctx_from)
- ->CopyDataFromTo(remote_from, remote_from_offset, local_to, local_to_offset, nbytes,
- remote_ctx_from, cpu_ctx, type_hint, nullptr);
- this->AsyncStreamWait(remote_ctx_from, nullptr, on_complete);
+ DLTensor local_to;
+ local_to.data = local_to_bytes;
+ local_to.ctx = TVMContext{kDLCPU, 0};
+ local_to.ndim = remote_from->ndim;
+ local_to.shape = remote_from->shape;
+ local_to.dtype = remote_from->dtype;
+ local_to.strides = nullptr;
+ local_to.byte_offset = 0;
+ this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, nullptr);
+ this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete);
} catch (const std::runtime_error& e) {
this->SendException(on_complete, e.what());
}
diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc
index 54601e3..62b87af 100644
--- a/web/emcc/webgpu_runtime.cc
+++ b/web/emcc/webgpu_runtime.cc
@@ -82,6 +82,7 @@ class WebGPUDeviceAPI : public DeviceAPI {
void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); }
+ protected:
void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint,
TVMStreamHandle stream) final {
@@ -102,6 +103,7 @@ class WebGPUDeviceAPI : public DeviceAPI {
}
}
+ public:
TVMStreamHandle CreateStream(TVMContext ctx) final {
LOG(FATAL) << "Not implemented";
return nullptr;