You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "lidavidm (via GitHub)" <gi...@apache.org> on 2023/05/11 12:52:18 UTC
[GitHub] [arrow] lidavidm commented on a diff in pull request #34972: GH-34971: [Format] Enhance C-Data API to support non-cpu cases

lidavidm commented on code in PR #34972:
URL: https://github.com/apache/arrow/pull/34972#discussion_r1191133057


##########
cpp/src/arrow/c/abi.h:
##########
@@ -65,6 +85,111 @@ struct ArrowArray {
 
 #endif  // ARROW_C_DATA_INTERFACE
 
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure 
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU = 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA = 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST = 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL = 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN = 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL = 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI = 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM = 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCMHOST = 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV = 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED = 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI = 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU = 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON = 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependant.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types

Review Comment:
   Should we spell out the expected type for each device?



##########
cpp/src/arrow/c/abi.h:
##########
@@ -106,6 +231,78 @@ struct ArrowArrayStream {
 
 #endif  // ARROW_C_STREAM_INTERFACE
 
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. The device_type needs to be provided here
+  /// so that consumers can provide the correct type
+  /// of queue_ptr when calling get_next.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independantly from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// Because different frameworks use different types to represent this, we
+  /// accept a void* which should then be reinterpreted into whatever the
+  /// appropriate type is (e.g. cudaStream_t) for use by the producer.

Review Comment:
   Outdated?



##########
cpp/src/arrow/c/abi.h:
##########
@@ -65,6 +85,111 @@ struct ArrowArray {
 
 #endif  // ARROW_C_DATA_INTERFACE
 
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure 
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU = 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA = 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST = 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL = 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN = 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL = 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI = 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM = 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCMHOST = 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV = 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED = 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI = 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU = 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON = 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependant.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP). 
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after 
+  /// allocation in order to ensure safe evolution of the ABI in 
+  /// the future.
+  int64_t reserved[3];
+  int32_t reserved_addl;

Review Comment:
   Is this padding needed? (ArrowDeviceType is 32 bits, but there's 4 bytes of padding after it so that the 64 bit field after it is aligned.)
   
   Also:
   
   ```
   > g++ abi.h 
   abi.h:308:40: error: static assertion failed
     308 | static_assert(sizeof(ArrowDeviceArray) == 128, "");
         |               ~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org