Posted to github@arrow.apache.org by "paleolimbot (via GitHub)" <gi...@apache.org> on 2023/05/25 17:39:18 UTC

[GitHub] [arrow-nanoarrow] paleolimbot opened a new pull request, #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

paleolimbot opened a new pull request, #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205

   After:
   
   - https://github.com/zeroshade/arrow-non-cpu/tree/main
   - https://lists.apache.org/thread/o2hsw7o1gm3qgw5z51rmz6zqxh0p7bvk
   - https://github.com/apache/arrow/pull/34972
   
   Still very much in draft form with many open questions. This is basically an exercise in figuring out what set of generic helpers can be provided given the constraints of the ABI. Like nanoarrow core, enough functionality to make test arrays and streams is a must.
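
   For context, a rough sketch of how the drafted helpers might be used to wrap an existing CPU-allocated `ArrowArray` (the wrapper function and `my_array` are illustrative, not part of the PR):

   ```c
   #include "nanoarrow.h"
   #include "nanoarrow_device.h"

   // Hypothetical helper: wrap a CPU-allocated ArrowArray in an ArrowDeviceArray
   // using the helpers drafted in this PR.
   static void WrapCpuArray(struct ArrowArray* my_array, struct ArrowDeviceArray* out) {
     struct ArrowDevice* cpu = ArrowDeviceCpu();

     // Zeroes *out and stamps device_type (ARROW_DEVICE_CPU) and device_id (0)
     ArrowDeviceArrayInit(out, cpu);

     // No synchronization is needed for CPU memory
     out->sync_event = NULL;

     // Transfer ownership of the buffers into the device array
     ArrowArrayMove(my_array, &out->array);
   }
   ```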


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226823755


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device; if a producer wants data to be produced on multiple devices,
+/// it should provide multiple streams, one per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;

Review Comment:
   As above, this is a hangover from early attempts to get Metal to fit within this framework and I think it's probably best to remove it.
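
   For readers following the review: a minimal sketch (helper name hypothetical) of what the draft's CPU device currently does with this handle, mirroring `ArrowDeviceCpuBufferCopy` quoted further down:

   ```c
   #include <stdint.h>
   #include "nanoarrow_device.h"

   // For ARROW_DEVICE_CPU the handle is just a base pointer, and offset_bytes
   // is a plain byte offset into it; other devices may not satisfy this.
   static const uint8_t* ArrowDeviceBufferViewCpuAddr(struct ArrowDeviceBufferView view) {
     return ((const uint8_t*)view.private_data) + view.offset_bytes;
   }
   ```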




[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235783661


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);

Review Comment:
   Correct, the addresses do not change; just the data pointed to is undefined until synchronized.
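
   To make the implication concrete, a sketch of the consumer-side wait this implies for CUDA (assuming, per the header docs, that `sync_event` coerces to `cudaEvent_t*`; the helper and `consumer_stream` are illustrative, not part of the PR):

   ```c
   #include <cuda_runtime.h>
   #include "nanoarrow_device.h"

   // Make the consumer's stream wait on the producer's event before touching
   // the (otherwise undefined) device buffers. Non-blocking for the CPU.
   static cudaError_t WaitOnDeviceArray(struct ArrowDeviceArray* array,
                                        cudaStream_t consumer_stream) {
     if (array->sync_event == NULL) {
       return cudaSuccess;  // producer indicated no synchronization is needed
     }

     cudaEvent_t event = *(cudaEvent_t*)array->sync_event;
     return cudaStreamWaitEvent(consumer_stream, event, 0);
   }
   ```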




[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1238822194


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));

Review Comment:
   It seems like we do need to do the bad thing here and wait for a sync before calling `cudaMemcpy()` for the GPU -> CPU direction (although hopefully this is now isolated such that it won't get accidentally called by somebody who does not need this).
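
   For concreteness, a sketch of that blocking path (assuming `sync_event` coerces to `cudaEvent_t*` for `ARROW_DEVICE_CUDA`; the helper and its arguments are illustrative, not the PR's implementation):

   ```c
   #include <stddef.h>
   #include <cuda_runtime.h>

   // Block the CPU on the producer's event, then copy device memory to host.
   static cudaError_t CopyDeviceBufferToCpu(void* cpu_dst, const void* gpu_src,
                                            size_t size_bytes, void* sync_event) {
     if (sync_event != NULL) {
       cudaError_t result = cudaEventSynchronize(*(cudaEvent_t*)sync_event);
       if (result != cudaSuccess) {
         return result;
       }
     }

     // Only valid once the event above has completed
     return cudaMemcpy(cpu_dst, gpu_src, size_bytes, cudaMemcpyDeviceToHost);
   }
   ```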




[GitHub] [arrow-nanoarrow] zeroshade commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235503005


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (both could be implemented
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;

Review Comment:
   Yeah, it should only get set to NULL when the `release` callback is called (just to prevent a dangling pointer); the callback itself is what cleans up the event.
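
For context, a minimal sketch of what that lifecycle could look like for a CUDA-backed array is below. The struct and function names are hypothetical and only illustrate the division of responsibility: the producer's `release` callback destroys the event, while the consumer merely stops referencing `sync_event`.

```c
#include <cuda_runtime_api.h>
#include <stdlib.h>

#include "nanoarrow_device.h"

// Hypothetical release callback for an ArrowArray whose private_data owns a
// cudaEvent_t: the callback destroys the event and frees the private data.
static void HypotheticalCudaArrayRelease(struct ArrowArray* array) {
  cudaEvent_t* event = (cudaEvent_t*)array->private_data;
  cudaEventDestroy(*event);
  free(event);
  array->release = NULL;
}

// Sketch of the consumer side: after releasing the wrapped array, sync_event
// is set to NULL only so it doesn't dangle; the event was already cleaned up.
static void HypotheticalDeviceArrayRelease(struct ArrowDeviceArray* device_array) {
  if (device_array->array.release != NULL) {
    device_array->array.release(&device_array->array);
  }
  device_array->sync_event = NULL;
}
```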




[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237073509


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;

Review Comment:
   Ok, this one should be handled now (I don't have a multi-GPU system to test on, but all the right information is piped to the right places for when that can actually be tested).
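
As a rough sketch of the idea, assuming (for illustration only) that the device id is stashed directly in `allocator.private_data`; the actual plumbing in the PR may differ:

```c
#include <errno.h>
#include <stdint.h>

#include <cuda_runtime_api.h>

#include "nanoarrow_device.h"

// Sketch: record the owning CUDA device id in the allocator's private_data so
// the free callback can switch to that device before calling cudaFree().
static void HypotheticalCudaFreeOnDevice(struct ArrowBufferAllocator* allocator,
                                         uint8_t* ptr, int64_t old_size) {
  if (ptr != NULL) {
    cudaSetDevice((int)(intptr_t)allocator->private_data);
    cudaFree(ptr);
  }
}

static ArrowErrorCode HypotheticalCudaAllocate(struct ArrowBuffer* buffer,
                                               int64_t size_bytes, int device_id) {
  void* ptr = NULL;
  cudaSetDevice(device_id);
  if (cudaMalloc(&ptr, (size_t)size_bytes) != cudaSuccess) {
    return EINVAL;
  }

  buffer->data = (uint8_t*)ptr;
  buffer->size_bytes = size_bytes;
  buffer->capacity_bytes = size_bytes;
  // (a full implementation would also set allocator.reallocate)
  buffer->allocator.free = &HypotheticalCudaFreeOnDevice;
  buffer->allocator.private_data = (void*)(intptr_t)device_id;
  return NANOARROW_OK;
}
```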




[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235468628


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal.cc:
##########
@@ -0,0 +1,331 @@
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#define NS_PRIVATE_IMPLEMENTATION
+#define MTL_PRIVATE_IMPLEMENTATION
+#include <Metal/Metal.hpp>
+
+#include "nanoarrow_device.hpp"
+
+#include "nanoarrow_device_metal.h"
+
+// If non-null, caller must ->release() the return value. This doesn't
+// release the underlying memory (which must be managed separately).
+static MTL::Buffer* ArrowDeviceMetalWrapBufferNonOwning(MTL::Device* mtl_device,
+                                                        const void* arbitrary_addr,
+                                                        int64_t size_bytes) {
+  // We can wrap any zero-size buffer
+  if (size_bytes == 0) {
+    return mtl_device->newBuffer(0, MTL::ResourceStorageModeShared);
+  }
+
+  // Cache the page size from the system call
+  static int pagesize = 0;
+  if (pagesize == 0) {
+    pagesize = getpagesize();
+  }
+
+  int64_t allocation_size;
+  if (size_bytes % pagesize == 0) {
+    allocation_size = size_bytes;
+  } else {
+    allocation_size = ((size_bytes / pagesize) + 1) * pagesize;
+  }
+
+  // Will return nullptr if the memory is improperly aligned
+  return mtl_device->newBuffer(arbitrary_addr, allocation_size,
+                               MTL::ResourceStorageModeShared, nullptr);
+}
+
+static uint8_t* ArrowDeviceMetalAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  // Cache the page size from the system call
+  static int pagesize = 0;
+  if (pagesize == 0) {
+    pagesize = getpagesize();
+  }
+
+  int64_t allocation_size;
+  if (new_size % pagesize == 0) {
+    allocation_size = new_size;
+  } else {
+    allocation_size = ((new_size / pagesize) + 1) * pagesize;
+  }
+
+  // If growing an existing buffer but the allocation size is still big enough,
+  // return the same pointer and do nothing.
+  if (ptr != nullptr && new_size >= old_size && new_size <= allocation_size) {
+    return ptr;
+  }
+
+  int64_t copy_size;
+  if (new_size > old_size) {
+    copy_size = old_size;
+  } else {
+    copy_size = new_size;
+  }
+
+  void* new_ptr = nullptr;
+  posix_memalign(&new_ptr, pagesize, allocation_size);
+  if (new_ptr != nullptr && ptr != nullptr) {
+    memcpy(new_ptr, ptr, copy_size);
+  }
+
+  if (ptr != nullptr) {
+    free(ptr);
+  }
+
+  return reinterpret_cast<uint8_t*>(new_ptr);
+}
+
+static void ArrowDeviceMetalAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                          uint8_t* ptr, int64_t old_size) {
+  free(ptr);
+}
+
+void ArrowDeviceMetalInitBuffer(struct ArrowBuffer* buffer) {
+  buffer->allocator.reallocate = &ArrowDeviceMetalAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceMetalAllocatorFree;
+  buffer->allocator.private_data = nullptr;
+  buffer->data = nullptr;

Review Comment:
   For CUDA we definitely need this... For Metal as implemented here, this is just `posix_memalign()` and `free()` regardless of the device identifier.
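
For reference, a minimal sketch of the page-aligned host allocation that makes the Metal wrapping work, assuming the same POSIX `posix_memalign()`/`getpagesize()` calls used here; the function name is made up for illustration:

```c
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

// Sketch: MTL::Device::newBuffer() can wrap host memory without copying only
// when the address is page-aligned and the length spans whole pages, so the
// host allocation rounds up accordingly.
static uint8_t* HypotheticalAllocatePageAligned(int64_t size_bytes) {
  int pagesize = getpagesize();
  int64_t allocation_size = ((size_bytes + pagesize - 1) / pagesize) * pagesize;

  void* ptr = NULL;
  if (posix_memalign(&ptr, (size_t)pagesize, (size_t)allocation_size) != 0) {
    return NULL;
  }

  return (uint8_t*)ptr;  // freed with free(), independent of any device id
}
```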




[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237108140


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (both could be implemented
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));

Review Comment:
   I think I correctly separated out the case where it's needed (it *is* needed before copying to the CPU, correct? Or is that synchronization handled by `cudaMemcpy()`?).
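
For reference, a minimal sketch of the ordering being asked about, assuming (as in this extension's CUDA backend) that `sync_event` points at a `cudaEvent_t`. Whether the explicit wait is strictly required in every case is exactly the open question, so this only shows the conservative version:

```c
#include <errno.h>

#include <cuda_runtime_api.h>

// Sketch: block the host on the producer's event, then copy device memory to
// host memory. The explicit event wait is the conservative way to order the
// copy against whatever stream the producer used to write the buffer.
static int HypotheticalCopyToHost(void* sync_event, const void* device_ptr,
                                  void* host_ptr, size_t size_bytes) {
  if (sync_event != NULL) {
    cudaEvent_t* event = (cudaEvent_t*)sync_event;
    if (cudaEventSynchronize(*event) != cudaSuccess) {
      return EINVAL;
    }
  }

  if (cudaMemcpy(host_ptr, device_ptr, size_bytes, cudaMemcpyDeviceToHost) !=
      cudaSuccess) {
    return EINVAL;
  }

  return 0;
}
```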




[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237109714


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,362 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result = cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
+                                    cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result = cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
+                                    cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result = cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
+                                    cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, src.data.as_uint8, (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, src.data.as_uint8, (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes,
+                                    cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes,
+                                    cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes,
+                                    cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes);
+    return NANOARROW_OK;

Review Comment:
   Ok, these all use `cudaMemcpyHostToHost` now.
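
In other words, the CPU / `ARROW_DEVICE_CUDA_HOST` combinations reduce to something like the sketch below (a plain `memcpy()` of pinned memory would also be valid; routing through the runtime just keeps the error handling uniform with the device paths):

```c
#include <errno.h>

#include <cuda_runtime_api.h>

// Sketch: CUDA_HOST (pinned) memory is directly addressable by the CPU, so a
// copy in either direction is a host-to-host copy as far as the runtime goes.
static int HypotheticalCopyPinnedHost(void* dst, const void* src, size_t size_bytes) {
  if (cudaMemcpy(dst, src, size_bytes, cudaMemcpyHostToHost) != cudaSuccess) {
    return EINVAL;
  }
  return 0;
}
```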




[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237463786


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {

Review Comment:
   Yeah, issuing two single-element copies to get the starting and ending offsets onto the CPU is necessary, but once we have those offsets we should just do pointer arithmetic to get a pointer and a size to feed into a single copy call.

   Ideally you want to issue the two individual element copies asynchronously (potentially on different streams so they can overlap, even though they're tiny), synchronize the stream(s) since you need those values in host code, and then issue the actual data copy.
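
A sketch of that pattern for an int32 offsets buffer is below; the stream handling is collapsed to a single caller-provided stream and the helper name is made up, but the shape is the one described: two async element copies, one synchronize, then one bulk copy.

```c
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include <cuda_runtime_api.h>

// Sketch: fetch the first and last offsets, wait for them on the host, then
// issue a single copy for the byte range they bound.
static int HypotheticalCopyBinaryData(const int32_t* device_offsets,
                                      const uint8_t* device_data, int64_t offset,
                                      int64_t length, cudaStream_t stream,
                                      uint8_t** out_data, int64_t* out_size) {
  int32_t first_offset;
  int32_t last_offset;

  // Two tiny element copies issued asynchronously so they can overlap.
  cudaMemcpyAsync(&first_offset, device_offsets + offset, sizeof(int32_t),
                  cudaMemcpyDeviceToHost, stream);
  cudaMemcpyAsync(&last_offset, device_offsets + offset + length, sizeof(int32_t),
                  cudaMemcpyDeviceToHost, stream);

  // The host needs both values before it can size the bulk copy.
  if (cudaStreamSynchronize(stream) != cudaSuccess) {
    return EINVAL;
  }

  *out_size = last_offset - first_offset;
  *out_data = (uint8_t*)malloc((size_t)*out_size);
  if (*out_data == NULL && *out_size > 0) {
    return ENOMEM;
  }

  // One copy for the whole data range; pointer arithmetic on the device
  // pointer selects the slice bounded by the two offsets.
  if (cudaMemcpy(*out_data, device_data + first_offset, (size_t)*out_size,
                 cudaMemcpyDeviceToHost) != cudaSuccess) {
    free(*out_data);
    return EINVAL;
  }

  return 0;
}
```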




[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237096372


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(

Review Comment:
   I added these comments to the documentation and also added `ArrowDeviceArrayViewSetArrayMinimal()`, which doesn't do any copies to or from the device.
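
   For reference, a minimal sketch of how the two entry points are meant to be used (hedged: this assumes `ArrowDeviceArrayViewSetArrayMinimal()` mirrors the signature of `ArrowDeviceArrayViewSetArray()` and that the inner `array_view` is initialized from the schema as in nanoarrow core):

   ```c
   static ArrowErrorCode SetViewExample(struct ArrowDeviceArray* device_array,
                                        struct ArrowSchema* schema,
                                        struct ArrowError* error) {
     struct ArrowDeviceArrayView device_array_view;
     ArrowDeviceArrayViewInit(&device_array_view);
     NANOARROW_RETURN_NOT_OK(
         ArrowArrayViewInitFromSchema(&device_array_view.array_view, schema, error));

     // Cheap option: fill lengths and buffer views from CPU-accessible metadata
     // only (variable-size data buffer sizes may remain unknown)
     NANOARROW_RETURN_NOT_OK(
         ArrowDeviceArrayViewSetArrayMinimal(&device_array_view, device_array, error));

     // Alternatively, the full version may issue small device-to-CPU copies to
     // resolve offsets and validate buffer sizes:
     // ArrowDeviceArrayViewSetArray(&device_array_view, device_array, error);

     ArrowDeviceArrayViewReset(&device_array_view);
     return NANOARROW_OK;
   }
   ```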





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235463135


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {

Review Comment:
   I'm not sure I understand where that would be used? It's definitely suboptimal to issue copies this way (but your suggestion of skipping validation and Keith's suggestion of leveraging async memcpy may both be workarounds).





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235460721


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (those could be implemented
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;

Review Comment:
   I probably just misunderstood/didn't read the spec carefully enough. I suppose this should always point to a valid `cudaEvent_t` or similar even if synchronization has already happened?
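
   For what it's worth, my reading of that would look something like this for CUDA (hedged sketch of one interpretation of the spec; the stream handles and event lifetime management here are assumptions, not something the PR defines):

   ```c
   #include <cuda_runtime_api.h>

   #include "nanoarrow_device.h"

   // Producer side: record an event on the stream that produced the buffers and
   // expose a pointer to it via sync_event, even if the work may already be done.
   static void ProducerRecordEvent(struct ArrowDeviceArray* device_array,
                                   cudaEvent_t* event, cudaStream_t producer_stream) {
     cudaEventRecord(*event, producer_stream);
     device_array->sync_event = event;
   }

   // Consumer side: make a consumer stream wait on the event (or call
   // cudaEventSynchronize(*event) to block the host) before touching the buffers.
   static void ConsumerWaitOnArray(struct ArrowDeviceArray* device_array,
                                   cudaStream_t consumer_stream) {
     cudaEvent_t* event = (cudaEvent_t*)device_array->sync_event;
     if (event != NULL) {
       cudaStreamWaitEvent(consumer_stream, *event, 0);
     }
   }
   ```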





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237037841


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;

Review Comment:
   I forgot that I already had `ArrowBufferDeallocator()` to handle this pattern (which simplifies this section considerably).
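
   For reference, the pattern that helper enables looks roughly like the sketch below (hedged: double-check the exact `ArrowBufferDeallocator()` signature against nanoarrow.h; `device_ptr` is a hypothetical `cudaMalloc()` result):

   ```c
   // Free callback matching the allocator's custom free signature: it only
   // knows how to release the device memory, never to reallocate it.
   static void ArrowDeviceCudaDeallocate(struct ArrowBufferAllocator* allocator,
                                         uint8_t* ptr, int64_t old_size) {
     if (ptr != NULL) {
       cudaFree(ptr);
     }
   }

   // Wrap an existing device allocation in an ArrowBuffer without copying it.
   static void WrapCudaAllocation(struct ArrowBuffer* buffer, void* device_ptr,
                                  int64_t size_bytes) {
     ArrowBufferInit(buffer);
     buffer->data = (uint8_t*)device_ptr;
     buffer->size_bytes = size_bytes;
     buffer->capacity_bytes = size_bytes;
     buffer->allocator = ArrowBufferDeallocator(&ArrowDeviceCudaDeallocate, NULL);
   }
   ```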





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237063299


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;

Review Comment:
   Ok, this is all removed now.





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235483947


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device; if a producer wants data to be produced on multiple devices,
+/// then multiple streams should be provided, one per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;
+
+  /// \brief The size of the buffer in bytes
+  int64_t size_bytes;
+};
+
+/// \brief A Device wrapper with callbacks for basic memory management tasks
+///
+/// All device objects are currently implemented as singletons; however, this
+/// may change as implementations progress.
+struct ArrowDevice {
+  /// \brief The device type integer identifier (see ArrowDeviceArray)
+  ArrowDeviceType device_type;
+
+  /// \brief The device identifier (see ArrowDeviceArray)
+  int64_t device_id;
+
+  /// \brief Initialize an owning buffer from existing content
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// copying existing content.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_init)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Move an owning buffer to a device
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// moving an existing buffer. If NANOARROW_OK is returned, src will have
+  /// been released or moved by the implementation and dst must be released by
+  /// the caller.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_move)(struct ArrowDevice* device_src, struct ArrowBuffer* src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);

Review Comment:
   The idea was that an implementation might be able to handle a few directions, although it does result in verbose method implementations. For the CUDA case the generality is somewhat useful: it can also theoretically move a buffer from CUDA_HOST to CUDA, and it would be difficult to construct a method signature that captures that. That generality might also not be useful 🤷
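
   To make the "few directions" point concrete, here is a rough sketch (not code from this PR) of how a CUDA `buffer_move` could cover the CUDA_HOST -> CUDA direction alongside the same-device move. The helper name and the allocator handling are assumptions; it presumes the same includes as `nanoarrow_device_cuda.c`, and a real implementation would install a `cudaFree`-based allocator on `dst` the way `ArrowDeviceCudaAllocateBuffer()` does.

   ```c
   // Sketch only: same-device CUDA -> CUDA is a true move; pinned host -> CUDA
   // degenerates to allocate + copy + release of the source buffer.
   static ArrowErrorCode ArrowDeviceCudaBufferMoveSketch(
       struct ArrowDevice* device_src, struct ArrowBuffer* src,
       struct ArrowDevice* device_dst, struct ArrowBuffer* dst) {
     if (device_src->device_type == ARROW_DEVICE_CUDA &&
         device_dst->device_type == ARROW_DEVICE_CUDA &&
         device_src->device_id == device_dst->device_id) {
       ArrowBufferMove(src, dst);
       return NANOARROW_OK;
     }

     if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
         device_dst->device_type == ARROW_DEVICE_CUDA) {
       void* ptr = NULL;
       if (cudaMalloc(&ptr, (size_t)src->size_bytes) != cudaSuccess) {
         return EINVAL;
       }
       if (cudaMemcpy(ptr, src->data, (size_t)src->size_bytes,
                      cudaMemcpyHostToDevice) != cudaSuccess) {
         cudaFree(ptr);
         return EINVAL;
       }
       ArrowBufferInit(dst);
       dst->data = (uint8_t*)ptr;
       dst->size_bytes = src->size_bytes;
       dst->capacity_bytes = src->size_bytes;
       // NOTE: dst->allocator is still the default CPU allocator at this point;
       // a real implementation would replace it with a cudaFree-based one.
       ArrowBufferReset(src);
       return NANOARROW_OK;
     }

     return ENOTSUP;
   }
   ```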



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237088268


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {

Review Comment:
   Ok, I updated the function name to `ArrowDeviceArrayViewResolveBufferSizes()` since that's what it's actually doing and added `ArrowDeviceArrayViewSetArrayMinimal()` that just sets `array_view->buffers[i].size_bytes` to `-1` if it would require a copy to calculate.
   
   For the case of "just get me the pointer value", I don't think there needs to be a function (`array_view->buffers[i].data.as_int32 + some_logical_offset` would do it).
   
   For the case where we copy back to the CPU, I don't see a way around copying the last int32/int64 from the offsets buffer (or else there is no way to know how many bytes of the next buffer to copy). We can possibly mitigate the impact of that by asynchronously kicking off all the tiny copies at once?
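
   For reference, a hypothetical sketch of "kicking off all the tiny copies at once" with CUDA: one small `cudaMemcpyAsync` per offsets buffer on a single stream, synchronized once at the end. The function and parameter names are illustrative only (this is not part of the PR), it assumes int32 offsets for every column, and real overlap would also want the host destinations pinned.

   ```c
   // Sketch only: copy the last int32 offset of each variable-size column back
   // to the CPU with small asynchronous copies, then wait once.
   static ArrowErrorCode CopyLastOffsetsAsyncSketch(const int32_t** device_offsets,
                                                    const int64_t* offset_plus_length,
                                                    int64_t n_columns,
                                                    int32_t* last_offsets_out) {
     cudaStream_t stream;
     if (cudaStreamCreate(&stream) != cudaSuccess) {
       return EINVAL;
     }

     for (int64_t i = 0; i < n_columns; i++) {
       // A 4-byte device-to-host copy of offsets[offset + length]; these queue
       // up on the stream instead of synchronizing one at a time.
       if (cudaMemcpyAsync(&last_offsets_out[i],
                           device_offsets[i] + offset_plus_length[i],
                           sizeof(int32_t), cudaMemcpyDeviceToHost,
                           stream) != cudaSuccess) {
         cudaStreamDestroy(stream);
         return EINVAL;
       }
     }

     // One wait for all of the small copies before the host reads the results
     if (cudaStreamSynchronize(stream) != cudaSuccess) {
       cudaStreamDestroy(stream);
       return EINVAL;
     }

     cudaStreamDestroy(stream);
     return NANOARROW_OK;
   }
   ```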



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237102158


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);

Review Comment:
   Ok...validation is now off the table (never performed) and resolving all buffer sizes is now optional.
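
   For the record, a rough sketch of the consumer-side flow this enables. The function names follow the earlier comment (`ArrowDeviceArrayViewSetArrayMinimal()` / `ArrowDeviceArrayViewResolveBufferSizes()`), but the exact signatures here are assumptions rather than the final API.

   ```c
   static ArrowErrorCode ConsumeDeviceArraySketch(struct ArrowSchema* schema,
                                                  struct ArrowDeviceArray* device_array,
                                                  struct ArrowError* error) {
     struct ArrowDeviceArrayView view;
     ArrowDeviceArrayViewInit(&view);
     NANOARROW_RETURN_NOT_OK(
         ArrowArrayViewInitFromSchema(&view.array_view, schema, error));

     // Cheap: never touches device memory; offset-derived buffer sizes may be
     // left as -1.
     NANOARROW_RETURN_NOT_OK(
         ArrowDeviceArrayViewSetArrayMinimal(&view, device_array, error));

     // Optional and potentially expensive: copies the trailing offsets back to
     // the CPU so that every buffer size is known (e.g., before a device copy).
     NANOARROW_RETURN_NOT_OK(
         ArrowDeviceArrayViewResolveBufferSizes(&view, device_array, error));

     ArrowDeviceArrayViewReset(&view);
     return NANOARROW_OK;
   }
   ```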



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1227365627


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;

Review Comment:
   `cuMemHostGetDevicePointer()` gets a pointer to that pinned host memory that can be used from device code but doesn't actually copy any memory to device memory. As far as I know it can be used anywhere that device memory can be used, but it obviously has different performance characteristics, which would likely be very unexpected in those contexts.
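
   A minimal sketch of that, using the runtime-API equivalent `cudaHostGetDevicePointer()` (the wrapper name is made up; it assumes the memory really was allocated with `cudaMallocHost()`/`cudaHostAlloc()` or registered with `cudaHostRegister()`):

   ```c
   #include <cuda_runtime_api.h>

   static cudaError_t GetDeviceViewOfPinnedHostSketch(void* pinned_host_ptr,
                                                      void** device_ptr_out) {
     // Flags must currently be 0. No bytes are copied: the returned address
     // aliases the pinned host allocation and is dereferenced over PCIe.
     return cudaHostGetDevicePointer(device_ptr_out, pinned_host_ptr, 0);
   }
   ```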



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237105636


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }

Review Comment:
   Probably not today... the main motivation of the Metal implementation is to demonstrate that `ArrowDeviceArray` works for more than just CUDA (or else there would be no point in having it anywhere except a CUDA header).





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235471660


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {

Review Comment:
   It's poorly thought out on my part... I think the idea was that a GPU wouldn't have to wait on itself, but the GPU *would* have to wait on the CPU. This should for sure be simplified.
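   
   For what it's worth, a minimal sketch of a simplified version, assuming `sync_event` points to a `cudaEvent_t` recorded by the producer and the includes already present in `nanoarrow_device_cuda.c`; the name and signature here are illustrative, not the final API:
   
   ```c
   static ArrowErrorCode ArrowDeviceCudaSynchronizeSketch(void* sync_event,
                                                          struct ArrowError* error) {
     if (sync_event == NULL) {
       return NANOARROW_OK;  // nothing to wait on
     }
   
     // Block the calling (CPU) thread until all work captured by the event is done
     cudaError_t result = cudaEventSynchronize(*((cudaEvent_t*)sync_event));
     if (result != cudaSuccess) {
       ArrowErrorSet(error, "cudaEventSynchronize() failed: %s",
                     cudaGetErrorString(result));
       return EINVAL;
     }
   
     return NANOARROW_OK;
   }
   ```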





[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1238823590


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));

Review Comment:
   We should sync after calling the `cudaMemcpy()` rather than before it. Otherwise, you could in theory get into a situation where that device-to-host copy is asynchronous (if you have pinned host memory, for example), and accessing the result from the CPU without synchronizing would be a race condition.
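   
   For illustration only (hypothetical helper, not code from this PR), a minimal sketch of the copy-then-sync ordering being described:
   
   ```c
   #include <errno.h>
   #include <cuda_runtime_api.h>
   
   // Copy size_bytes from device memory to host memory and only return once the
   // destination is safe to read from the CPU.
   static int copy_to_host_then_sync(void* host_dst, const void* device_src,
                                     size_t size_bytes, cudaStream_t stream) {
     cudaError_t result = cudaMemcpyAsync(host_dst, device_src, size_bytes,
                                          cudaMemcpyDeviceToHost, stream);
     if (result != cudaSuccess) {
       return EINVAL;
     }
   
     // Synchronize *after* issuing the copy; without this, reading host_dst from
     // the CPU would race the asynchronous transfer.
     result = cudaStreamSynchronize(stream);
     if (result != cudaSuccess) {
       return EINVAL;
     }
   
     return 0;  // host_dst now holds the copied bytes
   }
   ```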





[GitHub] [arrow-nanoarrow] paleolimbot merged pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot merged PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205




[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235564700


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,

Review Comment:
   +1 to the int32 function here



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {

Review Comment:
   Definitely don't want to issue copies one by one like this, even asynchronously; that would be very bad performance-wise and would put significant pressure on the system via the GPU driver.
   
   I would +1 @zeroshade's suggestion of skipping validation, and generally anything else that needs to introspect the data.
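   To illustrate, a minimal sketch (not part of this PR; the function name is hypothetical) of what skipping introspection for non-CPU devices could look like, assuming `nanoarrow_device.h` is included:
   
   ```c
   static ArrowErrorCode ArrowDeviceArrayViewValidateSketch(
       struct ArrowDevice* device, struct ArrowArrayView* array_view) {
     if (device->device_type != ARROW_DEVICE_CPU) {
       // Offset buffers live in device memory; checking them would mean copying
       // values back one at a time through the driver, so skip those checks.
       return NANOARROW_OK;
     }
   
     // On the CPU the existing offset/size checks could run here (omitted).
     (void)array_view;
     return NANOARROW_OK;
   }
   ```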



##########
extensions/nanoarrow_device/CMakeLists.txt:
##########
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+message(STATUS "Building using CMake version: ${CMAKE_VERSION}")
+cmake_minimum_required(VERSION 3.14)
+include(FetchContent)
+
+if(NOT DEFINED CMAKE_C_STANDARD)
+  set(CMAKE_C_STANDARD 11)
+endif()
+
+project(nanoarrow_device)
+
+option(NANOARROW_DEVICE_BUILD_TESTS "Build tests" OFF)
+option(NANOARROW_DEVICE_BUNDLE "Create bundled nanoarrow_device.h and nanoarrow_device.c" OFF)
+option(NANOARROW_DEVICE_WITH_METAL "Build Apple metal extension" OFF)
+option(NANOARROW_DEVICE_WITH_CUDA "Build CUDA extension" OFF)
+
+
+option(NANOARROW_DEVICE_CODE_COVERAGE "Enable coverage reporting" OFF)
+add_library(device_coverage_config INTERFACE)
+
+if (NANOARROW_DEVICE_BUILD_TESTS OR NOT NANOARROW_DEVICE_BUNDLE)
+  # Add the nanoarrow dependency. nanoarrow is not linked into the
+  # nanoarrow_device library (the caller must link this themselves);
+  # however, we need nanoarrow.h to build nanoarrow_device.c.
+  FetchContent_Declare(
+    nanoarrow
+    SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+
+  # Don't install nanoarrow because of this configuration
+  FetchContent_GetProperties(nanoarrow)
+  if(NOT nanoarrow_POPULATED)
+    FetchContent_Populate(nanoarrow)
+    add_subdirectory(${nanoarrow_SOURCE_DIR} ${nanoarrow_BINARY_DIR} EXCLUDE_FROM_ALL)
+  endif()
+endif()
+
+if (NANOARROW_DEVICE_BUNDLE)
+  # The CMake build step is creating nanoarrow_device.c and nanoarrow_device.h;
+  # the CMake install step is copying them to a specific location
+  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation)
+  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation/nanoarrow)
+
+  # nanoarrow_device.h is currently standalone
+  set(NANOARROW_DEVICE_H_TEMP ${CMAKE_BINARY_DIR}/amalgamation/nanoarrow/nanoarrow_device.h)
+  file(READ src/nanoarrow/nanoarrow_device.h SRC_FILE_CONTENTS)
+  file(WRITE ${NANOARROW_DEVICE_H_TEMP} "${SRC_FILE_CONTENTS}")
+
+  # nanoarrow_device.c is currently standalone
+  set(NANOARROW_DEVICE_C_TEMP ${CMAKE_BINARY_DIR}/amalgamation/nanoarrow/nanoarrow_device.c)
+  file(READ src/nanoarrow/nanoarrow_device.c SRC_FILE_CONTENTS)
+  file(WRITE ${NANOARROW_DEVICE_C_TEMP} "${SRC_FILE_CONTENTS}")
+
+  # Add a library that the tests can link against (but don't install it)
+  if(NANOARROW_DEVICE_BUILD_TESTS)
+    add_library(nanoarrow_device ${NANOARROW_DEVICE_C_TEMP})
+
+    target_include_directories(nanoarrow_device PUBLIC
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
+      $<BUILD_INTERFACE:${nanoarrow_SOURCE_DIR}/src/nanoarrow>
+      $<BUILD_INTERFACE:${nanoarrow_BINARY_DIR}/generated>
+      $<BUILD_INTERFACE:${NANOARROW_DEVICE_FLATCC_INCLUDE_DIR}>)
+  endif()
+
+  # Install the amalgamated header and sources
+  install(FILES
+    ${NANOARROW_DEVICE_H_TEMP}
+    ${NANOARROW_DEVICE_C_TEMP}
+    DESTINATION ".")
+else()
+  # This is a normal CMake build that builds + installs some includes and a static lib
+  if (NANOARROW_DEVICE_WITH_METAL)
+    if (NOT EXISTS "${CMAKE_BINARY_DIR}/metal-cpp")
+      message(STATUS "Fetching metal-cpp")
+      file(DOWNLOAD
+        "https://developer.apple.com/metal/cpp/files/metal-cpp_macOS12_iOS15.zip"
+        "${CMAKE_BINARY_DIR}/metal-cpp.zip")
+      file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/metal-cpp.zip DESTINATION ${CMAKE_BINARY_DIR})
+    endif()
+
+    if(NOT DEFINED CMAKE_CXX_STANDARD)
+      set(CMAKE_CXX_STANDARD 17)
+    endif()
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+    find_library(METAL_LIBRARY Metal REQUIRED)
+    message(STATUS "Metal framework found at '${METAL_LIBRARY}'")
+
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    message(STATUS "Foundation framework found at '${FOUNDATION_LIBRARY}'")
+
+    find_library(QUARTZ_CORE_LIBRARY QuartzCore REQUIRED)
+    message(STATUS "CoreFoundation framework found at '${QUARTZ_CORE_LIBRARY}'")
+
+    set(NANOARROW_DEVICE_SOURCES_METAL src/nanoarrow/nanoarrow_device_metal.cc)
+    set(NANOARROW_DEVICE_INCLUDE_METAL ${CMAKE_BINARY_DIR}/metal-cpp)
+    set(NANOARROW_DEVICE_LIBS_METAL ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} ${QUARTZ_CORE_LIBRARY})
+    set(NANOARROW_DEVICE_DEFS_METAL "NANOARROW_DEVICE_WITH_METAL")
+  endif()
+
+  if (NANOARROW_DEVICE_WITH_CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    set(NANOARROW_DEVICE_SOURCES_CUDA src/nanoarrow/nanoarrow_device_cuda.c)
+    set(NANOARROW_DEVICE_LIBS_CUDA CUDA::cudart)

Review Comment:
   If we're going to continue using the runtime API, we may want to link against the static library instead: `CUDA::cudart_static`.
   
   I would still recommend using the driver library, though.



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }

Review Comment:
   Should we handle the case where someone doesn't want to use the default Metal device? Intel Macs supported multiple graphics devices, and presumably Apple silicon could support multiple devices in the future as well.
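   A hypothetical sketch of per-id resolution inside `ArrowDeviceResolve()` (`ArrowDeviceMetal()` is not part of this PR; on macOS it could be backed by metal-cpp's `MTL::CopyAllDevices()` and match on `registryID`):
   
   ```c
   // Hypothetical: return the Metal device with this id, or NULL if none exists.
   struct ArrowDevice* ArrowDeviceMetal(int64_t device_id);
   
   #ifdef NANOARROW_DEVICE_WITH_METAL
     if (device_type == ARROW_DEVICE_METAL) {
       // Resolve by id rather than only matching the default device's id;
       // an unknown id falls through to the final return NULL below.
       struct ArrowDevice* metal_device = ArrowDeviceMetal(device_id);
       if (metal_device != NULL) {
         return metal_device;
       }
     }
   #endif
   ```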



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));

Review Comment:
   Why do we need to synchronize on the event here? This function is ultimately just responsible for setting the pointers in the array view from the passed-in array, correct?
   
   If so, synchronizing guarantees that the data underneath the pointers is valid, but it doesn't affect the pointers themselves at all.
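   For comparison, a hedged sketch (not what this PR does; the helper name is made up) that waits on the event only immediately before device memory is actually read back:
   
   ```c
   static ArrowErrorCode ReadInt32FromDeviceSketch(struct ArrowDevice* device,
                                                   void* sync_event,
                                                   struct ArrowBufferView offsets,
                                                   int64_t i, int32_t* out,
                                                   struct ArrowError* error) {
     // Setting pointers/lengths needs no synchronization; wait on the producer's
     // event only when the buffer contents are about to be dereferenced.
     NANOARROW_RETURN_NOT_OK(
         device->synchronize_event(ArrowDeviceCpu(), device, sync_event, error));
   
     struct ArrowDeviceBufferView src;
     src.private_data = offsets.data.data;
     src.offset_bytes = i * (int64_t)sizeof(int32_t);
     src.size_bytes = sizeof(int32_t);
   
     struct ArrowDeviceBufferView dst;
     dst.private_data = out;
     dst.offset_bytes = 0;
     dst.size_bytes = sizeof(int32_t);
   
     return ArrowDeviceBufferCopy(device, src, ArrowDeviceCpu(), dst);
   }
   ```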



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,362 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result = cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
+                                    cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result = cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
+                                    cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result = cudaMemcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes,
+                                    cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, src.data.as_uint8, (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, src.data.as_uint8, (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, src.data.as_uint8, (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes,
+                                    cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes,
+                                    cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes,
+                                    cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(dst.data.as_uint8, src.data.as_uint8, dst.size_bytes);
+    return NANOARROW_OK;

Review Comment:
   These aren't safe because CUDA host (pinned) memory is subject to the asynchronous behavior of GPU execution. We should still use `cudaMemcpy` with `cudaMemcpyHostToHost`, as that handles the synchronization.
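   A minimal sketch of the suggested replacement for those host-memory branches (mirroring the device branches above):
   
   ```c
   // Let the CUDA runtime handle any pending synchronization for pinned host
   // memory instead of using a plain memcpy().
   cudaError_t result = cudaMemcpy(dst.data.as_uint8, src.data.as_uint8,
                                   (size_t)dst.size_bytes, cudaMemcpyHostToHost);
   if (result != cudaSuccess) {
     return EINVAL;
   }
   return NANOARROW_OK;
   ```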



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewCopyInternal(struct ArrowDevice* device_src,
+                                                       struct ArrowArrayView* src,
+                                                       struct ArrowDevice* device_dst,
+                                                       struct ArrowArray* dst) {
+  // Currently no attempt to minimize the amount of memory copied (i.e.,
+  // by applying offset + length and copying potentially fewer bytes)
+  dst->length = src->length;
+  dst->offset = src->offset;
+  dst->null_count = src->null_count;
+
+  struct ArrowDeviceBufferView buffer_view_src;
+  buffer_view_src.offset_bytes = 0;
+
+  for (int i = 0; i < 3; i++) {
+    if (src->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
+      break;
+    }
+
+    buffer_view_src.private_data = src->buffer_views[i].data.data;
+    buffer_view_src.size_bytes = src->buffer_views[i].size_bytes;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferInit(device_src, buffer_view_src, device_dst,
+                                                  ArrowArrayBuffer(dst, i)));
+  }
+
+  for (int64_t i = 0; i < src->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewCopyInternal(
+        device_src, src->children[i], device_dst, dst->children[i]));
+  }
+
+  if (src->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewCopyInternal(
+        device_src, src->dictionary, device_dst, dst->dictionary));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewCopy(struct ArrowDeviceArrayView* src,
+                                        struct ArrowDevice* device_dst,
+                                        struct ArrowDeviceArray* dst) {
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(&tmp, &src->array_view, NULL));
+
+  int result =
+      ArrowDeviceArrayViewCopyInternal(src->device, &src->array_view, device_dst, &tmp);
+  if (result != NANOARROW_OK) {
+    tmp.release(&tmp);
+    return result;
+  }
+
+  result = ArrowArrayFinishBuilding(&tmp, NANOARROW_VALIDATION_LEVEL_MINIMAL, NULL);
+  if (result != NANOARROW_OK) {
+    tmp.release(&tmp);
+    return result;
+  }
+
+  ArrowDeviceArrayInit(dst, device_dst);
+  ArrowArrayMove(&tmp, &dst->array);
+  dst->device_type = device_dst->device_type;
+  dst->device_id = device_dst->device_id;
+  return result;
+}
+
+int ArrowDeviceArrayViewCopyRequired(struct ArrowDeviceArrayView* src,
+                                     struct ArrowDevice* device_dst) {

Review Comment:
   I think we should add some specification of the expected behavior of this function, e.g. a `CUDA` device could read a `CUDA_HOST` buffer without a copy. Similarly, a `CUDA` device may be able to read a `CUDA` buffer from a different device without copying it.
   
   Neither of these behaviors may be desired though.
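   For illustration only, here is a minimal sketch (hypothetical, not part of this PR) of how a CUDA-backed `copy_required` could spell out those zero-copy cases. The function name and the peer-access policy are placeholders:

   ```c
   // Hypothetical sketch: the kinds of zero-copy decisions a CUDA-backed
   // copy_required could make. Return values mirror the CPU implementation
   // in this PR (0 = no copy required, -1 = copy required/unknown).
   static int HypotheticalCudaCopyRequired(struct ArrowDevice* device_src,
                                           struct ArrowArrayView* src,
                                           struct ArrowDevice* device_dst) {
     (void)src;  // buffer contents are not inspected in this sketch

     // A CUDA device can read CUDA_HOST (pinned) memory without a copy
     if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
         device_dst->device_type == ARROW_DEVICE_CUDA) {
       return 0;
     }

     // Same device type and id: nothing to do
     if (device_src->device_type == device_dst->device_type &&
         device_src->device_id == device_dst->device_id) {
       return 0;
     }

     // A CUDA device might also read a buffer on another CUDA device if peer
     // access is enabled, but that needs a runtime check, so stay conservative
     return -1;
   }
   ```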



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewCopyInternal(struct ArrowDevice* device_src,
+                                                       struct ArrowArrayView* src,
+                                                       struct ArrowDevice* device_dst,
+                                                       struct ArrowArray* dst) {
+  // Currently no attempt to minimize the amount of memory copied (i.e.,
+  // by applying offset + length and copying potentially fewer bytes)
+  dst->length = src->length;
+  dst->offset = src->offset;
+  dst->null_count = src->null_count;
+
+  struct ArrowDeviceBufferView buffer_view_src;
+  buffer_view_src.offset_bytes = 0;
+
+  for (int i = 0; i < 3; i++) {
+    if (src->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
+      break;
+    }
+
+    buffer_view_src.private_data = src->buffer_views[i].data.data;
+    buffer_view_src.size_bytes = src->buffer_views[i].size_bytes;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferInit(device_src, buffer_view_src, device_dst,
+                                                  ArrowArrayBuffer(dst, i)));

Review Comment:
   If this buffer initialization is asynchronous, then I think we need to set a synchronization event somewhere?
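   As a rough sketch of what that could look like (hypothetical helper using the CUDA runtime; not something in this PR), the side issuing the async copy could record an event on the same stream and hand it to the consumer via the device array's `sync_event`:

   ```c
   // Hypothetical helper: perform an async device copy and record an event
   // that marks its completion. The caller would attach *event_out to the
   // outgoing ArrowDeviceArray::sync_event so the consumer can wait on it.
   #include <errno.h>
   #include <cuda_runtime.h>

   static int HypotheticalAsyncCopyWithEvent(void* dst, const void* src, size_t n,
                                             cudaStream_t stream,
                                             cudaEvent_t* event_out) {
     if (cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, stream) != cudaSuccess) {
       return EINVAL;
     }

     // Record completion of the copy on the same stream; the consumer waits on
     // this (cudaEventSynchronize or cudaStreamWaitEvent) before reading dst
     if (cudaEventCreateWithFlags(event_out, cudaEventDisableTiming) != cudaSuccess ||
         cudaEventRecord(*event_out, stream) != cudaSuccess) {
       return EINVAL;
     }

     return 0;
   }
   ```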



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(

Review Comment:
   When building general-purpose APIs for accelerators I generally approach them with the mindset of "make copies to/from device explicit, not implicit". In this situation, if someone wanted to validate their data, I think a reasonable tradeoff could be that they explicitly copy the data to the CPU device themselves and then call validation against the CPU device.
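   A minimal sketch of that explicit flow using only the helpers added in this PR (assumes `src_view` was already populated from a non-CPU `ArrowDeviceArray`; error cleanup elided for brevity):

   ```c
   // Sketch: copy the data to the CPU device explicitly, then build a view
   // over the CPU copy so that validation only ever touches CPU memory.
   static ArrowErrorCode HypotheticalValidateOnCpu(struct ArrowDeviceArrayView* src_view) {
     struct ArrowDeviceArray cpu_array;
     struct ArrowDeviceArrayView cpu_view;

     // Explicit, user-initiated copy from the source device to the CPU
     NANOARROW_RETURN_NOT_OK(
         ArrowDeviceArrayViewCopy(src_view, ArrowDeviceCpu(), &cpu_array));

     // Validation now runs against CPU buffers only
     ArrowDeviceArrayViewInit(&cpu_view);
     NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewSetArray(&cpu_view, &cpu_array, NULL));

     ArrowDeviceArrayViewReset(&cpu_view);
     cpu_array.array.release(&cpu_array.array);
     return NANOARROW_OK;
   }
   ```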



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235553634


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);

Review Comment:
   Quick question here: are the buffer addresses guaranteed to be valid before the event is synchronized? (i.e., do we have to wait on the sync event before doing `array->buffers[i]`, or is it just before accessing the content?)
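   For what it's worth, the conservative reading (and what `ArrowDeviceArrayViewSetArray` does above) would be to wait before touching anything, roughly:

   ```c
   // Hypothetical helper illustrating the conservative interpretation: wait on
   // the producer's sync_event before reading any buffer address or content.
   static ArrowErrorCode HypotheticalWaitBeforeAccess(struct ArrowDevice* device,
                                                      struct ArrowDeviceArray* device_array,
                                                      struct ArrowError* error) {
     NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
                                                       device_array->sync_event, error));
     device_array->sync_event = NULL;

     // Only now (under this reading) is it safe to look at
     // device_array->array.buffers[i] or copy their contents
     return NANOARROW_OK;
   }
   ```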



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] codecov-commenter commented on pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "codecov-commenter (via GitHub)" <gi...@apache.org>.
codecov-commenter commented on PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#issuecomment-1564699612

   ## [Codecov](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205?src=pr&el=h1&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) Report
   > Merging [#205](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205?src=pr&el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) (7692b50) into [main](https://app.codecov.io/gh/apache/arrow-nanoarrow/commit/d9f428dec04f1a610b1c76b58f454be560a004df?el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) (d9f428d) will **increase** coverage by `0.08%`.
   > The diff coverage is `94.73%`.
   
   ```diff
   @@            Coverage Diff             @@
   ##             main     #205      +/-   ##
   ==========================================
   + Coverage   87.90%   87.99%   +0.08%     
   ==========================================
     Files          60       63       +3     
     Lines        9112     9228     +116     
   ==========================================
   + Hits         8010     8120     +110     
   - Misses       1102     1108       +6     
   ```
   
   
   | [Impacted Files](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | Coverage Δ | |
   |---|---|---|
   | [.../nanoarrow\_device/src/nanoarrow/nanoarrow\_device.c](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache#diff-ZXh0ZW5zaW9ucy9uYW5vYXJyb3dfZGV2aWNlL3NyYy9uYW5vYXJyb3cvbmFub2Fycm93X2RldmljZS5j) | `92.50% <92.50%> (ø)` | |
   | [.../nanoarrow\_device/src/nanoarrow/nanoarrow\_device.h](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache#diff-ZXh0ZW5zaW9ucy9uYW5vYXJyb3dfZGV2aWNlL3NyYy9uYW5vYXJyb3cvbmFub2Fycm93X2RldmljZS5o) | `100.00% <100.00%> (ø)` | |
   | [...anoarrow\_device/src/nanoarrow/nanoarrow\_device.hpp](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache#diff-ZXh0ZW5zaW9ucy9uYW5vYXJyb3dfZGV2aWNlL3NyYy9uYW5vYXJyb3cvbmFub2Fycm93X2RldmljZS5ocHA=) | `100.00% <100.00%> (ø)` | |
   
   ... and [1 file with indirect coverage changes](https://app.codecov.io/gh/apache/arrow-nanoarrow/pull/205/indirect-changes?src=pr&el=tree-more&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache)
   
   :mega: We’re building smart automated test selection to slash your CI/CD build times. [Learn more](https://about.codecov.io/iterative-testing/?utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] zeroshade commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235501429


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {

Review Comment:
   It would probably be useful for device code to be able to access the value at a specific index of the array without having to perform the copy. It would also allow CPU code to find the address of a specific index (a pointer into non-CPU memory) that can then be used for whatever is necessary on the device side without needing to copy the value.
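
   A minimal sketch (not part of this PR) of what such a helper could look like, reusing the `ArrowDeviceBufferView` defined here; the helper name and the fixed element size are hypothetical:

   ```c
   #include "nanoarrow_device.h"

   // Hypothetical: compute the device address of element i without copying it
   // to the host. Assumes private_data is an address on which pointer
   // arithmetic is valid (true for the CPU and CUDA devices in this PR).
   static inline const void* ArrowDeviceBufferViewElementAddr(
       struct ArrowDeviceBufferView view, int64_t i, int64_t element_size_bytes) {
     return ((const uint8_t*)view.private_data) + view.offset_bytes +
            i * element_size_bytes;
   }
   ```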



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235475967


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;

Review Comment:
   I should probably just set `allocator->reallocate = NULL` instead of defining this...this would get called if somebody tried `ArrowBufferAppend()` on one of these, but there really isn't an opportunity to do so except in internal code within the extension, and in any case `ArrowBufferAppend()` would crash when it tried to copy the memory.
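
   A sketch of that simplification against the allocator setup above (assuming the surrounding `ArrowDeviceCudaAllocateBuffer()` scope, and that nothing in the extension ever appends to a device-allocated buffer):

   ```c
   buffer->data = (uint8_t*)ptr;
   buffer->size_bytes = size_bytes;
   buffer->capacity_bytes = size_bytes;
   buffer->allocator.reallocate = NULL;  // growth is intentionally unsupported
   buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
   buffer->allocator.private_data = NULL;
   ```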



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226845796


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>

Review Comment:
   That does seem like a better fit (although it may require implementing some reference counting of our own).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226854070


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet

Review Comment:
   I don't think there's a technical limitation preventing `ArrowDeviceArrayViewCopy()` from returning after having kicked off all the buffer copies and populated the `ArrowDeviceArray`'s `sync_event`...this is mostly a personal limitation (steep learning curve for me).
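
   For reference, a rough sketch of the asynchronous pattern being described (hypothetical names: `stream` would be a CUDA stream owned by the device, and `dst_ptr`/`src_ptr`/`size_bytes` come from the buffer views; none of these are in the PR as written):

   ```c
   // Queue the copy without blocking the host, then record an event that the
   // consumer can wait on via ArrowDeviceArray::sync_event.
   cudaEvent_t* event = (cudaEvent_t*)ArrowMalloc(sizeof(cudaEvent_t));
   if (event == NULL) return ENOMEM;
   if (cudaEventCreate(event) != cudaSuccess) {
     ArrowFree(event);
     return EINVAL;
   }
   if (cudaMemcpyAsync(dst_ptr, src_ptr, (size_t)size_bytes,
                       cudaMemcpyHostToDevice, stream) != cudaSuccess ||
       cudaEventRecord(*event, stream) != cudaSuccess) {
     cudaEventDestroy(*event);
     ArrowFree(event);
     return EINVAL;
   }
   device_array->sync_event = event;
   ```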



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226821712


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;

Review Comment:
   An early version of this for Apple Metal couldn't assume that pointer arithmetic could be done (for a Metal buffer to be sent to the GPU, it literally has to be an `MTL::Buffer`, and you can't wrap an arbitrary pointer + length unless the pointer and length are page-aligned). My first solution was to make `private_data` an `MTL::Buffer*` rather than a pointer to actual data; my second solution was to page-align all the buffers. Neither of those is ideal, but they also don't apply to CUDA. I could probably remove this bit of indirection since none of the devices I've implemented use it anymore.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237103440


##########
extensions/nanoarrow_device/CMakeLists.txt:
##########
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+message(STATUS "Building using CMake version: ${CMAKE_VERSION}")
+cmake_minimum_required(VERSION 3.14)
+include(FetchContent)
+
+if(NOT DEFINED CMAKE_C_STANDARD)
+  set(CMAKE_C_STANDARD 11)
+endif()
+
+project(nanoarrow_device)
+
+option(NANOARROW_DEVICE_BUILD_TESTS "Build tests" OFF)
+option(NANOARROW_DEVICE_BUNDLE "Create bundled nanoarrow_device.h and nanoarrow_device.c" OFF)
+option(NANOARROW_DEVICE_WITH_METAL "Build Apple metal extension" OFF)
+option(NANOARROW_DEVICE_WITH_CUDA "Build CUDA extension" OFF)
+
+
+option(NANOARROW_DEVICE_CODE_COVERAGE "Enable coverage reporting" OFF)
+add_library(device_coverage_config INTERFACE)
+
+if (NANOARROW_DEVICE_BUILD_TESTS OR NOT NANOARROW_DEVICE_BUNDLE)
+  # Add the nanoarrow dependency. nanoarrow is not linked into the
+  # nanoarrow_device library (the caller must link this themselves);
+  # however, we need nanoarrow.h to build nanoarrow_device.c.
+  FetchContent_Declare(
+    nanoarrow
+    SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+
+  # Don't install nanoarrow because of this configuration
+  FetchContent_GetProperties(nanoarrow)
+  if(NOT nanoarrow_POPULATED)
+    FetchContent_Populate(nanoarrow)
+    add_subdirectory(${nanoarrow_SOURCE_DIR} ${nanoarrow_BINARY_DIR} EXCLUDE_FROM_ALL)
+  endif()
+endif()
+
+if (NANOARROW_DEVICE_BUNDLE)
+  # The CMake build step is creating nanoarrow_device.c and nanoarrow_device.h;
+  # the CMake install step is copying them to a specific location
+  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation)
+  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation/nanoarrow)
+
+  # nanoarrow_device.h is currently standalone
+  set(NANOARROW_DEVICE_H_TEMP ${CMAKE_BINARY_DIR}/amalgamation/nanoarrow/nanoarrow_device.h)
+  file(READ src/nanoarrow/nanoarrow_device.h SRC_FILE_CONTENTS)
+  file(WRITE ${NANOARROW_DEVICE_H_TEMP} "${SRC_FILE_CONTENTS}")
+
+  # nanoarrow_device.c is currently standalone
+  set(NANOARROW_DEVICE_C_TEMP ${CMAKE_BINARY_DIR}/amalgamation/nanoarrow/nanoarrow_device.c)
+  file(READ src/nanoarrow/nanoarrow_device.c SRC_FILE_CONTENTS)
+  file(WRITE ${NANOARROW_DEVICE_C_TEMP} "${SRC_FILE_CONTENTS}")
+
+  # Add a library that the tests can link against (but don't install it)
+  if(NANOARROW_DEVICE_BUILD_TESTS)
+    add_library(nanoarrow_device ${NANOARROW_DEVICE_C_TEMP})
+
+    target_include_directories(nanoarrow_device PUBLIC
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
+      $<BUILD_INTERFACE:${nanoarrow_SOURCE_DIR}/src/nanoarrow>
+      $<BUILD_INTERFACE:${nanoarrow_BINARY_DIR}/generated>
+      $<BUILD_INTERFACE:${NANOARROW_DEVICE_FLATCC_INCLUDE_DIR}>)
+  endif()
+
+  # Install the amalgamated header and sources
+  install(FILES
+    ${NANOARROW_DEVICE_H_TEMP}
+    ${NANOARROW_DEVICE_C_TEMP}
+    DESTINATION ".")
+else()
+  # This is a normal CMake build that builds + installs some includes and a static lib
+  if (NANOARROW_DEVICE_WITH_METAL)
+    if (NOT EXISTS "${CMAKE_BINARY_DIR}/metal-cpp")
+      message(STATUS "Fetching metal-cpp")
+      file(DOWNLOAD
+        "https://developer.apple.com/metal/cpp/files/metal-cpp_macOS12_iOS15.zip"
+        "${CMAKE_BINARY_DIR}/metal-cpp.zip")
+      file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/metal-cpp.zip DESTINATION ${CMAKE_BINARY_DIR})
+    endif()
+
+    if(NOT DEFINED CMAKE_CXX_STANDARD)
+      set(CMAKE_CXX_STANDARD 17)
+    endif()
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+    find_library(METAL_LIBRARY Metal REQUIRED)
+    message(STATUS "Metal framework found at '${METAL_LIBRARY}'")
+
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    message(STATUS "Foundation framework found at '${FOUNDATION_LIBRARY}'")
+
+    find_library(QUARTZ_CORE_LIBRARY QuartzCore REQUIRED)
+    message(STATUS "CoreFoundation framework found at '${QUARTZ_CORE_LIBRARY}'")
+
+    set(NANOARROW_DEVICE_SOURCES_METAL src/nanoarrow/nanoarrow_device_metal.cc)
+    set(NANOARROW_DEVICE_INCLUDE_METAL ${CMAKE_BINARY_DIR}/metal-cpp)
+    set(NANOARROW_DEVICE_LIBS_METAL ${METAL_LIBRARY} ${FOUNDATION_LIBRARY} ${QUARTZ_CORE_LIBRARY})
+    set(NANOARROW_DEVICE_DEFS_METAL "NANOARROW_DEVICE_WITH_METAL")
+  endif()
+
+  if (NANOARROW_DEVICE_WITH_CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    set(NANOARROW_DEVICE_SOURCES_CUDA src/nanoarrow/nanoarrow_device_cuda.c)
+    set(NANOARROW_DEVICE_LIBS_CUDA CUDA::cudart)

Review Comment:
   Driver library is a definite yes (just haven't gotten there yet).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#issuecomment-1603076375

   Ok! There are definitely some holes in this implementation (notably around properly synchronizing memory copies). I'd propose that this PR get merged (and clearly marked as under development/experimental in the README) with some related changes.
   
   - Switch to using the driver library: https://github.com/apache/arrow-nanoarrow/issues/246
   - Properly order copy operations when copying to/from device: https://github.com/apache/arrow-nanoarrow/issues/245
   - Expand test coverage to more than just string arrays: https://github.com/apache/arrow-nanoarrow/issues/247
   
   I think it's still an open question whether this particular extension will be used or will be useful...if there is no interest in using it before the next release, it can always be excluded from the source release (like the Python bindings currently are) or moved back to a PR state.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237070212


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);

Review Comment:
   Yes, this function is intended to do the bad thing and block until it's safe to do CPU things. If there's a way to avoid the sync before copying back to the CPU it could be removed.
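
   A sketch of a non-blocking alternative, assuming the consumer could supply a CUDA stream (`consumer_stream` is hypothetical; the current signature has no way to pass one):

   ```c
   // Queue the dependency on the consumer's stream instead of blocking the
   // host: work submitted to consumer_stream after this call waits for the
   // event to complete.
   cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
   if (cudaStreamWaitEvent(consumer_stream, *cuda_event, 0) != cudaSuccess) {
     ArrowErrorSet(error, "cudaStreamWaitEvent() failed");
     return EINVAL;
   }
   return NANOARROW_OK;
   ```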



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);
+
+  if (result != cudaSuccess) {
+    ArrowErrorSet(error, "cudaEventSynchronize() failed: %s", cudaGetErrorString(result));
+    return EINVAL;
+  }
+
+  cudaEventDestroy(*cuda_event);

Review Comment:
   Done!





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1238822887


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,

Review Comment:
   I think this is handled by `array_view->buffer_views[i].data.as_int32 + some_index` (which would get you the pointer to an element of a buffer).
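
   For anyone following along, a minimal sketch of the access pattern being described, using nanoarrow's `ArrowBufferView` union (the helper name and the buffer index are illustrative, not something defined in this PR):
   
   ```c
   #include "nanoarrow.h"
   
   // Hypothetical helper: read one element of an int32 offsets buffer on the CPU
   // device. buffer_views[1] is the offsets buffer for string/binary/list layouts,
   // and .data is a union, so .as_int32 views the same pointer as const int32_t*.
   static int32_t GetOffset32At(const struct ArrowArrayView* array_view, int64_t i) {
     return array_view->buffer_views[1].data.as_int32[i];
   }
   ```
   
   For non-CPU devices the same pointer arithmetic applies, but the resulting address is device memory, which is why the PR routes small reads like this through `ArrowDeviceBufferCopy()` to a CPU destination first.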





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235485150


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = NULL;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;
+
+  /// \brief The size of the buffer in bytes
+  int64_t size_bytes;
+};
+
+/// \brief A Device wrapper with callbacks for basic memory management tasks
+///
+/// All device objects are currently implemented as singletons; however, this
+/// may change as implementations progress.
+struct ArrowDevice {
+  /// \brief The device type integer identifier (see ArrowDeviceArray)
+  ArrowDeviceType device_type;
+
+  /// \brief The device identifier (see ArrowDeviceArray)
+  int64_t device_id;
+
+  /// \brief Initialize an owning buffer from existing content
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// copying existing content.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_init)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Move an owning buffer to a device
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// moving an existing buffer. If NANOARROW_OK is returned, src will have
+  /// been released or moved by the implementation and dst must be released by
+  /// the caller.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_move)(struct ArrowDevice* device_src, struct ArrowBuffer* src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Copy a section of memory into a preallocated buffer
+  ///
+  /// As opposed to the other buffer operations, this is designed to support
+  /// copying very small slices of memory.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_copy)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst,
+                                struct ArrowDeviceBufferView dst);
+
+  /// \brief Check if a copy is required to move between devices
+  ///
+  /// Returns 1 (copy is required), 0 (copy not required; move is OK), or -1 (don't know)
+  int (*copy_required)(struct ArrowDevice* device_src, struct ArrowArrayView* src,
+                       struct ArrowDevice* device_dst);
+
+  /// \brief Wait for an event
+  ///
+  /// Implementations should handle at least waiting on the CPU host.
+  /// Implementations do not have to handle a NULL sync_event.
+  ArrowErrorCode (*synchronize_event)(struct ArrowDevice* device,
+                                      struct ArrowDevice* device_event, void* sync_event,
+                                      struct ArrowError* error);

Review Comment:
   This is a hangover from an early idea that maybe a device wouldn't have to wait on itself and should almost certainly be removed.
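
   If that parameter does get dropped, the trimmed-down callback might look something like the sketch below (purely illustrative, not what the header currently declares):
   
   ```c
   // Hypothetical simplified signature: a device only ever waits on its own
   // event, so the second ArrowDevice* argument disappears.
   ArrowErrorCode (*synchronize_event)(struct ArrowDevice* device, void* sync_event,
                                       struct ArrowError* error);
   ```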





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235472731


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;

Review Comment:
   Absolutely 😬 





[GitHub] [arrow-nanoarrow] zeroshade commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1235737155


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);

Review Comment:
   @kkraus14 would be able to better answer it, but as far as I'm aware the addresses are valid. It's basically just a race condition on whether the contents have been updated or not. Sync'ing on the event ensures that the contents of the memory addresses are up to date.
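
   For context, a minimal sketch of the consumer-side wait being described (the function name is illustrative and error handling is trimmed):
   
   ```c
   #include <errno.h>
   #include <cuda_runtime_api.h>
   
   // Hypothetical consumer: block the CPU until the producer's event has fired,
   // after which the device buffers referenced by the array are safe to read.
   static int WaitForDeviceArray(void* sync_event) {
     if (sync_event == NULL) {
       return 0;  // nothing to wait on; contents are already visible
     }
     cudaEvent_t* event = (cudaEvent_t*)sync_event;
     return (cudaEventSynchronize(*event) == cudaSuccess) ? 0 : EINVAL;
   }
   ```
   
   When the consumer has its own CUDA stream, `cudaStreamWaitEvent()` is the non-blocking alternative: the wait happens on the GPU rather than stalling the host.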





[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1224737589


##########
extensions/nanoarrow_device/CMakeLists.txt:
##########
@@ -0,0 +1,221 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+message(STATUS "Building using CMake version: ${CMAKE_VERSION}")
+cmake_minimum_required(VERSION 3.14)
+include(FetchContent)
+
+if(NOT DEFINED CMAKE_C_STANDARD)
+  set(CMAKE_C_STANDARD 11)
+endif()
+
+project(nanoarrow_device)
+
+option(NANOARROW_DEVICE_BUILD_TESTS "Build tests" OFF)
+option(NANOARROW_DEVICE_BUNDLE "Create bundled nanoarrow_device.h and nanoarrow_device.c" OFF)
+option(NANOARROW_DEVICE_WITH_METAL "Build Apple metal extension" OFF)
+option(NANOARROW_DEVICE_WITH_METAL "Build CUDA extension" OFF)

Review Comment:
   ```suggestion
   option(NANOARROW_DEVICE_WITH_CUDA "Build CUDA extension" OFF)
   ```



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;

Review Comment:
   Why is an offset needed if pointer arithmetic can be done and it's non-owning?



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>

Review Comment:
   We'd likely be better off using the CUDA driver API here instead of the runtime API, as it offers much stronger forward-compatibility guarantees as well as easier deployment (someone can have the driver installed but not the CUDA runtime, but not the reverse).
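
   For illustration, a driver-API allocation path might look roughly like the sketch below (not part of this PR; it uses the device's primary context and collapses all error handling into a single code):

   ```c
   #include <errno.h>
   #include <stddef.h>

   #include <cuda.h>  // CUDA driver API; ships with the display driver

   // Sketch only: allocate device memory without requiring the CUDA runtime.
   static int DriverApiAllocate(int device_id, size_t size_bytes, CUdeviceptr* out) {
     CUdevice dev;
     CUcontext ctx;
     if (cuInit(0) != CUDA_SUCCESS) return EINVAL;  // idempotent after first call
     if (cuDeviceGet(&dev, device_id) != CUDA_SUCCESS) return EINVAL;
     if (cuDevicePrimaryCtxRetain(&ctx, dev) != CUDA_SUCCESS) return EINVAL;
     if (cuCtxSetCurrent(ctx) != CUDA_SUCCESS) return EINVAL;
     if (cuMemAlloc(out, size_bytes) != CUDA_SUCCESS) return EINVAL;
     return 0;  // free with cuMemFree() and cuDevicePrimaryCtxRelease() later
   }
   ```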



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet

Review Comment:
   For what it's worth: the lack of asynchronous copies will likely be a blocker for most libraries / frameworks being able to make use of this.
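
   As a sketch of what an asynchronous variant could look like (the `cudaStream_t` parameter is an assumption; the current interface has no way to pass one through):

   ```c
   #include <errno.h>

   #include <cuda_runtime_api.h>

   // Sketch only: the same host-to-device copy, but ordered on a caller-provided
   // stream so the CPU is not blocked. The copy is only complete once the stream
   // is synchronized or an event recorded after it has been waited on.
   static int CopyHostToDeviceAsync(void* dst, const void* src, size_t size_bytes,
                                    cudaStream_t stream) {
     cudaError_t result =
         cudaMemcpyAsync(dst, src, size_bytes, cudaMemcpyHostToDevice, stream);
     return result == cudaSuccess ? 0 : EINVAL;
   }
   ```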



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}

Review Comment:
   Most GPU libraries / frameworks have their own memory pool / memory management implementations that are often asynchronous (and ordered by CUDA streams), so they won't be able to benefit from this implementation. This is generally true for most operations: free, alloc, realloc, memset, memcpy, etc.

   I'm not sure whether an actual implementation needs to live within nanoarrow or whether we can just define an interface for downstream libraries to implement.
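
   As a sketch of what such an interface could look like from the downstream side, a library could install its own deallocator on each `ArrowBuffer` (here `my_pool_free()` is a hypothetical pool entry point, not an existing API):

   ```c
   #include "nanoarrow.h"

   // Hypothetical pool entry point provided by the downstream library.
   void my_pool_free(void* pool, uint8_t* ptr, int64_t size_bytes);

   // The pool handle travels in allocator.private_data and is used instead of
   // cudaFree()/free() when the buffer is released.
   static void PoolAllocatorFree(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
                                 int64_t old_size) {
     if (ptr != NULL) {
       my_pool_free(allocator->private_data, ptr, old_size);
     }
   }

   static void AttachPoolToBuffer(struct ArrowBuffer* buffer, void* pool) {
     buffer->allocator.free = &PoolAllocatorFree;
     buffer->allocator.private_data = pool;  // owned by the downstream library
   }
   ```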



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);
+
+  if (result != cudaSuccess) {
+    ArrowErrorSet(error, "cudaEventSynchronize() failed: %s", cudaGetErrorString(result));
+    return EINVAL;
+  }
+
+  cudaEventDestroy(*cuda_event);

Review Comment:
   I believe the release callback on the ArrowDeviceArray is responsible for cleaning up the event
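
   Under that assumption, a producer-side sketch could look like the following (all names are hypothetical; the release callback lives on the embedded `ArrowArray`):

   ```c
   #include <stdlib.h>

   #include <cuda_runtime_api.h>

   #include "nanoarrow_device.h"

   // Hypothetical producer state: the event is owned by the producer and is
   // destroyed when the array is released, not by the consumer after syncing.
   struct HypotheticalProducerPrivate {
     cudaEvent_t event;
     // ...whatever else is needed to release the device buffers...
   };

   static void HypotheticalProducerRelease(struct ArrowArray* array) {
     struct HypotheticalProducerPrivate* private_data =
         (struct HypotheticalProducerPrivate*)array->private_data;
     cudaEventDestroy(private_data->event);
     free(private_data);
     array->release = NULL;
   }
   ```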



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);

Review Comment:
   In most situations you'd want to use `cudaStreamWaitEvent` as opposed to this API, as it's much more efficient and doesn't unnecessarily block the CPU until the work is done.
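
   A sketch of the stream-ordered alternative (the consumer-owned `cudaStream_t` is an assumption; today `synchronize_event` has no way to receive one):

   ```c
   #include <errno.h>

   #include <cuda_runtime_api.h>

   // Sketch only: make work subsequently enqueued on consumer_stream wait for
   // the producer's event without blocking the CPU.
   static int WaitOnConsumerStream(cudaStream_t consumer_stream, void* sync_event) {
     if (sync_event == NULL) {
       return 0;
     }

     cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
     cudaError_t result = cudaStreamWaitEvent(consumer_stream, *cuda_event, 0);
     return result == cudaSuccess ? 0 : EINVAL;
   }
   ```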



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;

Review Comment:
   Should we handle the situations where the src is `ARROW_DEVICE_CUDA_HOST` and dst is `ARROW_DEVICE_CUDA` and vice versa?



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;

Review Comment:
   Nitpick, but it's easy to mix this up with the higher-level `ArrowDeviceArrayStream.private_data` when reading through the code.
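
   For what it's worth, a purely illustrative spelling that avoids the overlap (`data_handle` is a hypothetical name, not something this PR proposes):

   ```c
   #include <stdint.h>

   // Sketch of a possible rename for ArrowDeviceBufferView so the buffer handle
   // is not confused with ArrowDeviceArrayStream.private_data; "data_handle" is
   // an illustrative name only.
   struct ArrowDeviceBufferView {
     const void* data_handle;  // device-defined handle (a plain address on the CPU device)
     int64_t offset_bytes;     // offset in bytes into the buffer behind data_handle
     int64_t size_bytes;       // size of the buffer in bytes
   };
   ```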





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226832653


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}

Review Comment:
   An early version of the `ArrowDeviceBufferXXX()` functions had a `sync_event*` argument, which I removed before I saw `cudaMemcpyAsync` and friends in the documentation. I don't know whether that's the perfect interface, but nanoarrow's generic "copy this array to the device" implementation would benefit a lot from it, since essentially all of those buffers could be copied in parallel.
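
   A rough sketch of that async flavour, assuming the copy is queued on a caller-provided stream and completion is signalled through an event that could back `ArrowDeviceArray.sync_event` (the function and parameter names are hypothetical):

   ```c
   #include <errno.h>
   #include <cuda_runtime_api.h>

   // Hypothetical async host-to-device copy: queue the copy on `stream` and record
   // `event` instead of blocking. Several buffers can be queued this way and the
   // consumer synchronizes once via cudaStreamWaitEvent()/cudaEventSynchronize().
   static int CopyBufferToDeviceAsync(void* device_dst, const void* host_src,
                                      size_t size_bytes, cudaStream_t stream,
                                      cudaEvent_t event) {
     cudaError_t result = cudaMemcpyAsync(device_dst, host_src, size_bytes,
                                          cudaMemcpyHostToDevice, stream);
     if (result != cudaSuccess) {
       return EINVAL;
     }

     // The recorded event is what a producer could expose as sync_event
     result = cudaEventRecord(event, stream);
     return result == cudaSuccess ? 0 : EINVAL;
   }
   ```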





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226835502


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);

Review Comment:
   I assume that is what a library doing GPU-to-GPU calculations would do. Here, though, I think it does need to be the blocking version: this is the method that is called before an arbitrary `ArrowDeviceArray` (or a slice of it) is copied back to the CPU.
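
   A minimal sketch of the two flavours, assuming `sync_event` really is a `cudaEvent_t*` as the draft struct documents (helper names are hypothetical):

   ```c
   #include <stddef.h>
   #include <cuda_runtime_api.h>

   // Blocking: what a CPU-side consumer needs before reading the buffers, e.g.
   // before copying an arbitrary ArrowDeviceArray (or a slice of it) back to the CPU.
   static int WaitForEventOnCpu(void* sync_event) {
     if (sync_event == NULL) return 0;
     cudaError_t result = cudaEventSynchronize(*(cudaEvent_t*)sync_event);
     return result == cudaSuccess ? 0 : -1;
   }

   // Non-blocking: what a GPU-to-GPU consumer would more likely do, making its own
   // stream wait on the producer's event without stalling the CPU.
   static int WaitForEventOnStream(cudaStream_t stream, void* sync_event) {
     if (sync_event == NULL) return 0;
     cudaError_t result = cudaStreamWaitEvent(stream, *(cudaEvent_t*)sync_event, 0);
     return result == cudaSuccess ? 0 : -1;
   }
   ```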





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226860742


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;

Review Comment:
   I did see `cuMemHostGetDevicePointer()`, so I assume this is possible. I think it would require `ArrowDeviceArrayViewCopy()` to have a device-specific implementation (probably for the best anyway).
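
   A sketch of the lookup such a device-specific implementation could use, assuming the pinned allocation is mapped so that `cudaHostGetDevicePointer()` (the runtime-API counterpart of `cuMemHostGetDevicePointer()`) applies; the helper name is hypothetical:

   ```c
   #include <cuda_runtime_api.h>

   // Hypothetical helper: resolve the device-visible address of a pinned host
   // allocation (cudaMallocHost()/cudaHostRegister()) so a device-specific copy
   // could treat CUDA_HOST buffers like device memory and skip the extra copy.
   static int GetDevicePointerForPinnedHost(void* host_ptr, void** device_ptr_out) {
     // flags must currently be 0 for cudaHostGetDevicePointer()
     cudaError_t result = cudaHostGetDevicePointer(device_ptr_out, host_ptr, 0);
     return result == cudaSuccess ? 0 : -1;
   }
   ```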





[GitHub] [arrow-nanoarrow] zeroshade commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "zeroshade (via GitHub)" <gi...@apache.org>.
zeroshade commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1228535899


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for

Review Comment:
   Should we link directly to the device interface docs (https://arrow.apache.org/docs/dev/format/CDeviceDataInterface.html)?



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;
+
+  /// \brief The size of the buffer in bytes
+  int64_t size_bytes;
+};
+
+/// \brief A Device wrapper with callbacks for basic memory management tasks
+///
+/// All device objects are currently implemented as singletons; however, this
+/// may change as implementations progress.
+struct ArrowDevice {
+  /// \brief The device type integer identifier (see ArrowDeviceArray)
+  ArrowDeviceType device_type;
+
+  /// \brief The device identifier (see ArrowDeviceArray)
+  int64_t device_id;
+
+  /// \brief Initialize an owning buffer from existing content
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// copying existing content.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_init)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Move an owning buffer to a device
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// moving an existing buffer. If NANOARROW_OK is returned, src will have
+  /// been released or moved by the implementation and dst must be released by
+  /// the caller.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_move)(struct ArrowDevice* device_src, struct ArrowBuffer* src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Copy a section of memory into a preallocated buffer
+  ///
+  /// As opposed to the other buffer operations, this is designed to support
+  /// copying very small slices of memory.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_copy)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst,
+                                struct ArrowDeviceBufferView dst);

Review Comment:
   Same question: should we spell out the semantics for each device type? Should implementations have to check the device type of both source and destination on every call, or should they have to implement multiple devices?
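
   One way to read the current draft (a sketch of a possible contract, not what the PR implements): each device's `buffer_copy` only recognizes the (src, dst) pairs it knows about and returns ENOTSUP otherwise, so a caller can try the source device first and then fall back to the destination device:

   ```c
   #include <errno.h>

   #include "nanoarrow_device.h"

   // Hypothetical dispatcher: rely on each implementation checking both device
   // types itself and returning ENOTSUP for pairs it does not handle.
   static ArrowErrorCode DeviceBufferCopyDispatch(struct ArrowDevice* device_src,
                                                  struct ArrowDeviceBufferView src,
                                                  struct ArrowDevice* device_dst,
                                                  struct ArrowDeviceBufferView dst) {
     ArrowErrorCode result = device_src->buffer_copy(device_src, src, device_dst, dst);
     if (result != ENOTSUP) {
       return result;
     }
     // The source device did not know this pair; try the destination device
     return device_dst->buffer_copy(device_src, src, device_dst, dst);
   }
   ```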



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device. If a producer wants data to be produced on multiple devices,
+/// then multiple streams should be provided, one per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;
+
+  /// \brief The size of the buffer in bytes
+  int64_t size_bytes;
+};
+
+/// \brief A Device wrapper with callbacks for basic memory management tasks
+///
+/// All device objects are currently implemented as singletons; however, this
+/// may change as implementations progress.
+struct ArrowDevice {
+  /// \brief The device type integer identifier (see ArrowDeviceArray)
+  ArrowDeviceType device_type;
+
+  /// \brief The device identifier (see ArrowDeviceArray)
+  int64_t device_id;
+
+  /// \brief Initialize an owning buffer from existing content
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// copying existing content.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_init)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Move an owning buffer to a device
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// moving an existing buffer. If NANOARROW_OK is returned, src will have
+  /// been released or moved by the implementation and dst must be released by
+  /// the caller.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_move)(struct ArrowDevice* device_src, struct ArrowBuffer* src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);

Review Comment:
   Should we specify semantics for the src and dst devices here (e.g., requiring that src be CPU memory)?
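   
   As a caller-side sketch of the kind of contract I have in mind (a purely hypothetical wrapper, assuming `src` is always CPU-resident under that contract):
   
   ```c
   // Hypothetical wrapper: if the spec guaranteed that src lives in CPU
   // memory, a consumer could always dispatch the move from the CPU device.
   static ArrowErrorCode MoveCpuBufferToDevice(struct ArrowBuffer* src,
                                               struct ArrowDevice* device_dst,
                                               struct ArrowBuffer* dst) {
     if (device_dst == NULL) {
       return EINVAL;
     }
   
     // Under the proposed contract, src is CPU memory here, so the CPU
     // device is always a valid device_src.
     return ArrowDeviceBufferMove(ArrowDeviceCpu(), src, device_dst, dst);
   }
   ```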



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {

Review Comment:
   `device` isn't used here, only `device_event`; why do we need both?
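   
   If the extra argument is only there for delegation, a slimmer callback signature (purely hypothetical, not what this PR proposes) could drop it entirely:
   
   ```c
   // Hypothetical alternative: the event's own device is the only device
   // the callback actually needs to consult.
   typedef ArrowErrorCode (*ArrowDeviceSynchronizeEventFn)(
       struct ArrowDevice* device_event, void* sync_event, struct ArrowError* error);
   ```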



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {

Review Comment:
   Should there be an equivalent that *doesn't* do the copy?
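   
   A sketch of what a copy-free fast path could look like (hypothetical helper name; assumes CPU and CUDA_HOST buffers are directly dereferenceable from the CPU):
   
   ```c
   // Hypothetical fast path: read the value in place when the buffer is
   // CPU-accessible; otherwise fall back to the copying helper.
   static ArrowErrorCode ArrowDeviceBufferPeekInt32(struct ArrowDevice* device,
                                                    struct ArrowBufferView buffer_view,
                                                    int64_t i, int32_t* out) {
     if (device->device_type == ARROW_DEVICE_CPU ||
         device->device_type == ARROW_DEVICE_CUDA_HOST) {
       *out = buffer_view.data.as_int32[i];
       return NANOARROW_OK;
     }
   
     // Device-only memory still needs the round trip through a copy.
     return ArrowDeviceBufferGetInt32(device, buffer_view, i, out);
   }
   ```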



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (we could implement those
+  // later on, too).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;

Review Comment:
   Why set it to NULL here?
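   
   If the intent is that a successful wait "consumes" the event, maybe a comment along these lines (just my reading of it) would make that explicit:
   
   ```c
   // (after device->synchronize_event(...) has succeeded)
   // A successful wait consumes the event: clearing it tells any later code
   // handling this ArrowDeviceArray that no further synchronization is needed
   // and that the pointer must not be waited on again.
   device_array->sync_event = NULL;
   ```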



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// Used to quickly test extension devices; semantics
+/// can differ based on the implementation.
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {

Review Comment:
   Update this with the newer, briefer version of the documentation, which leaves the detailed explanations to the official spec's prose rather than repeating them in these comments?



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(

Review Comment:
   Should include a comment stating that this will copy data from the device to the CPU in order to perform the validation, which means synchronization may need to happen before this can be called.
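
   A rough sketch of the kind of comment (and the caller-side ordering it implies); the wording and the illustrated call sequence are only a suggestion, not the final text:

       // NOTE: for offset-based types (string, binary, list, ...) validation copies
       // the last offset from the device buffer to the CPU via ArrowDeviceBufferCopy.
       // Callers must therefore wait on the array's sync_event before calling this,
       // e.g.:
       //
       //   NANOARROW_RETURN_NOT_OK(device->synchronize_event(
       //       ArrowDeviceCpu(), device, device_array->sync_event, error));
       //   NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewValidateDefault(
       //       device, &device_array_view->array_view));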



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = 0;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;
+
+  /// \brief The size of the buffer in bytes
+  int64_t size_bytes;
+};
+
+/// \brief A Device wrapper with callbacks for basic memory management tasks
+///
+/// All device objects are currently implemented as singletons; however, this
+/// may change as implementations progress.
+struct ArrowDevice {
+  /// \brief The device type integer identifier (see ArrowDeviceArray)
+  ArrowDeviceType device_type;
+
+  /// \brief The device identifier (see ArrowDeviceArray)
+  int64_t device_id;
+
+  /// \brief Initialize an owning buffer from existing content
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// copying existing content.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_init)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Move an owning buffer to a device
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// moving an existing buffer. If NANOARROW_OK is returned, src will have
+  /// been released or moved by the implementation and dst must be released by
+  /// the caller.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_move)(struct ArrowDevice* device_src, struct ArrowBuffer* src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Copy a section of memory into a preallocated buffer
+  ///
+  /// As opposed to the other buffer operations, this is designed to support
+  /// copying very small slices of memory.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_copy)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst,
+                                struct ArrowDeviceBufferView dst);
+
+  /// \brief Check if a copy is required to move between devices
+  ///
+  /// Returns 1 (copy is required), 0 (copy not required; move is OK), or -1 (don't know)
+  int (*copy_required)(struct ArrowDevice* device_src, struct ArrowArrayView* src,
+                       struct ArrowDevice* device_dst);
+
+  /// \brief Wait for an event
+  ///
+  /// Implementations should handle at least waiting on the CPU host.
+  /// Implementations do not have to handle a NULL sync_event.
+  ArrowErrorCode (*synchronize_event)(struct ArrowDevice* device,
+                                      struct ArrowDevice* device_event, void* sync_event,
+                                      struct ArrowError* error);

Review Comment:
   why is the event itself an `ArrowDevice`?



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_metal.cc:
##########
@@ -0,0 +1,331 @@
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#define NS_PRIVATE_IMPLEMENTATION
+#define MTL_PRIVATE_IMPLEMENTATION
+#include <Metal/Metal.hpp>
+
+#include "nanoarrow_device.hpp"
+
+#include "nanoarrow_device_metal.h"
+
+// If non-null, caller must ->release() the return value. This doesn't
+// release the underlying memory (which must be managed separately).
+static MTL::Buffer* ArrowDeviceMetalWrapBufferNonOwning(MTL::Device* mtl_device,
+                                                        const void* arbitrary_addr,
+                                                        int64_t size_bytes) {
+  // We can wrap any zero-size buffer
+  if (size_bytes == 0) {
+    return mtl_device->newBuffer(0, MTL::ResourceStorageModeShared);
+  }
+
+  // Cache the page size from the system call
+  static int pagesize = 0;
+  if (pagesize == 0) {
+    pagesize = getpagesize();
+  }
+
+  int64_t allocation_size;
+  if (size_bytes % pagesize == 0) {
+    allocation_size = size_bytes;
+  } else {
+    allocation_size = ((size_bytes / pagesize) + 1) * pagesize;
+  }
+
+  // Will return nullptr if the memory is improperly aligned
+  return mtl_device->newBuffer(arbitrary_addr, allocation_size,
+                               MTL::ResourceStorageModeShared, nullptr);
+}
+
+static uint8_t* ArrowDeviceMetalAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  // Cache the page size from the system call
+  static int pagesize = 0;
+  if (pagesize == 0) {
+    pagesize = getpagesize();
+  }
+
+  int64_t allocation_size;
+  if (new_size % pagesize == 0) {
+    allocation_size = new_size;
+  } else {
+    allocation_size = ((new_size / pagesize) + 1) * pagesize;
+  }
+
+  // If growing an existing buffer but the allocation size is still big enough,
+  // return the same pointer and do nothing.
+  if (ptr != nullptr && new_size >= old_size && new_size <= allocation_size) {
+    return ptr;
+  }
+
+  int64_t copy_size;
+  if (new_size > old_size) {
+    copy_size = old_size;
+  } else {
+    copy_size = new_size;
+  }
+
+  void* new_ptr = nullptr;
+  posix_memalign(&new_ptr, pagesize, allocation_size);
+  if (new_ptr != nullptr && ptr != nullptr) {
+    memcpy(new_ptr, ptr, copy_size);
+  }
+
+  if (ptr != nullptr) {
+    free(ptr);
+  }
+
+  return reinterpret_cast<uint8_t*>(new_ptr);
+}
+
+static void ArrowDeviceMetalAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                          uint8_t* ptr, int64_t old_size) {
+  free(ptr);
+}
+
+void ArrowDeviceMetalInitBuffer(struct ArrowBuffer* buffer) {
+  buffer->allocator.reallocate = &ArrowDeviceMetalAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceMetalAllocatorFree;
+  buffer->allocator.private_data = nullptr;
+  buffer->data = nullptr;

Review Comment:
   device_id? Same as in the CUDA code, we should have a `TODO` for getting the device id represented here, along with possibly the device itself.
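
   A minimal sketch of the kind of state that might carry (field names are hypothetical; the Metal implementation currently leaves allocator.private_data as nullptr):

       // Hypothetical allocator state so free/reallocate know which device owns the memory
       struct ArrowDeviceMetalAllocatorPrivate {
         int64_t device_id;
         void* mtl_device;  // opaque handle to the MTL::Device (assumption)
       };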



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;

Review Comment:
   `Reallocate` just calls free? Shouldn't this allocate a new thing and then copy?
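
   For illustration, one possible shape of an allocate-and-copy reallocation (a sketch only; it keeps the draft's convention that failure frees the old allocation and returns NULL, and the `...Sketch` name is hypothetical):

       static uint8_t* ArrowDeviceCudaAllocatorReallocateSketch(
           struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
           int64_t new_size) {
         void* new_ptr = NULL;
         if (cudaMalloc(&new_ptr, (size_t)new_size) != cudaSuccess) {
           ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
           return NULL;
         }

         if (ptr != NULL && old_size > 0) {
           // Preserve the existing content with a device-to-device copy
           int64_t copy_size = old_size < new_size ? old_size : new_size;
           if (cudaMemcpy(new_ptr, ptr, (size_t)copy_size, cudaMemcpyDeviceToDevice) !=
               cudaSuccess) {
             cudaFree(new_ptr);
             ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
             return NULL;
           }
         }

         ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
         return (uint8_t*)new_ptr;
       }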



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;

Review Comment:
   Allocator should probably be able to tell what device it's allocating on, right?
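
   As a sketch of what that could look like (the `...On` name is hypothetical, and it assumes the device id fits in the pointer-sized private_data slot):

       // Record which CUDA device the allocation lives on so free/reallocate
       // could cudaSetDevice() before touching it.
       static ArrowErrorCode ArrowDeviceCudaAllocateBufferOn(struct ArrowBuffer* buffer,
                                                             int64_t device_id,
                                                             int64_t size_bytes) {
         int previous_device = 0;
         cudaGetDevice(&previous_device);
         if (cudaSetDevice((int)device_id) != cudaSuccess) {
           return EINVAL;
         }

         void* ptr = NULL;
         cudaError_t result = cudaMalloc(&ptr, (size_t)size_bytes);
         cudaSetDevice(previous_device);
         if (result != cudaSuccess) {
           return EINVAL;
         }

         buffer->data = (uint8_t*)ptr;
         buffer->size_bytes = size_bytes;
         buffer->capacity_bytes = size_bytes;
         buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
         buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
         buffer->allocator.private_data = (void*)(intptr_t)device_id;
         return NANOARROW_OK;
       }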



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);
+
+  if (result != cudaSuccess) {
+    ArrowErrorSet(error, "cudaEventSynchronize() failed: %s", cudaGetErrorString(result));
+    return EINVAL;
+  }
+
+  cudaEventDestroy(*cuda_event);

Review Comment:
   @kkraus14 is correct. The release callback on `ArrowDeviceArray` should clean up the event it contains.
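
   For concreteness, a sketch of what that could look like for CUDA (the private-data layout and names are hypothetical, not what the draft currently does):

       struct HypotheticalCudaArrayPrivate {
         struct ArrowArray parent;  // the wrapped array whose release we chain to
         cudaEvent_t sync_event;    // owned by the exported ArrowDeviceArray
       };

       static void HypotheticalCudaArrayRelease(struct ArrowArray* array) {
         struct HypotheticalCudaArrayPrivate* private_data =
             (struct HypotheticalCudaArrayPrivate*)array->private_data;
         cudaEventDestroy(private_data->sync_event);
         private_data->parent.release(&private_data->parent);
         ArrowFree(private_data);
         array->release = NULL;
       }

   With that ownership, consumers such as ArrowDeviceArrayViewSetArray would only wait on the event and leave destroying it to the array's release callback.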



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (could implement those, too
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);

Review Comment:
   Can the validation be made optional? If so, could synchronizing on the event happen only when the caller actually requests validation?
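
   Something like this is what I'm imagining (just a sketch; `ArrowDeviceArrayViewSetArrayMinimal` and `ArrowDeviceArrayViewValidate` are hypothetical names, not functions in this PR):

   ```c
   // Hypothetical split of the current SetArray into a cheap "set" step and an
   // opt-in "validate" step so the event sync only happens when requested.
   static ArrowErrorCode ConsumeDeviceArray(struct ArrowDeviceArrayView* view,
                                            struct ArrowDeviceArray* array,
                                            struct ArrowError* error) {
     // Cheap: no waiting on array->sync_event, no device-to-CPU copies
     NANOARROW_RETURN_NOT_OK(
         ArrowDeviceArrayViewSetArrayMinimal(view, array, error));

     // Opt-in: synchronizes on the event and fetches the last offsets from the
     // device so buffer sizes and child lengths can be checked
     NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewValidate(view, array, error));
     return NANOARROW_OK;
   }
   ```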



##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);

Review Comment:
   To call `cudaStreamWaitEvent` you'd need to know which stream should be made to wait on the event.
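
   For reference, a consumer that did know its stream could queue the wait there instead of blocking the host (a sketch only; `consumer_stream` is an assumption, since how it gets passed in is exactly the open question):

   ```c
   // Sketch: queue the wait on the consumer's stream rather than blocking the
   // host with cudaEventSynchronize().
   static ArrowErrorCode WaitOnDeviceArray(struct ArrowDeviceArray* device_array,
                                           cudaStream_t consumer_stream) {
     cudaEvent_t* event = (cudaEvent_t*)device_array->sync_event;
     if (event == NULL) {
       return NANOARROW_OK;
     }

     // Future work submitted to consumer_stream waits on the producer's event;
     // the flags argument must be 0.
     cudaError_t result = cudaStreamWaitEvent(consumer_stream, *event, 0);
     if (result != cudaSuccess) {
       return EINVAL;
     }

     return NANOARROW_OK;
   }
   ```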





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1226839319


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {
+  if (sync_event == NULL) {
+    return NANOARROW_OK;
+  }
+
+  if (device_event->device_type != ARROW_DEVICE_CUDA &&
+      device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
+    return ENOTSUP;
+  }
+
+  // Pointer vs. not pointer...is there memory ownership to consider here?
+  cudaEvent_t* cuda_event = (cudaEvent_t*)sync_event;
+  cudaError_t result = cudaEventSynchronize(*cuda_event);
+
+  if (result != cudaSuccess) {
+    ArrowErrorSet(error, "cudaEventSynchronize() failed: %s", cudaGetErrorString(result));
+    return EINVAL;
+  }
+
+  cudaEventDestroy(*cuda_event);

Review Comment:
   I'll have to rework a few things to accommodate that, but it does make sense.
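
   Roughly what the rework might look like (a sketch only, not the final code): `synchronize_event` waits on the event but leaves destroying it to whoever owns the `ArrowDeviceArray` (e.g. its release callback).

   ```c
   // Sketch: wait on the event but do not destroy it here; the event stays
   // owned by the ArrowDeviceArray that carried it.
   static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
                                                    struct ArrowDevice* device_event,
                                                    void* sync_event,
                                                    struct ArrowError* error) {
     if (sync_event == NULL) {
       return NANOARROW_OK;
     }

     if (device_event->device_type != ARROW_DEVICE_CUDA &&
         device_event->device_type != ARROW_DEVICE_CUDA_HOST) {
       return ENOTSUP;
     }

     cudaError_t result = cudaEventSynchronize(*(cudaEvent_t*)sync_event);
     if (result != cudaSuccess) {
       ArrowErrorSet(error, "cudaEventSynchronize() failed: %s",
                     cudaGetErrorString(result));
       return EINVAL;
     }

     return NANOARROW_OK;
   }
   ```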





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237099144


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (nicer messages could be
+  // added later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;

Review Comment:
   Ok, this is handled now: `sync_event` always points to a valid `cudaEvent_t` for a (valid) CUDA device array.
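
   For illustration, on the producer side this ends up looking roughly like the following (a sketch; `producer_stream` is an assumption, and freeing the event, which belongs to the exported array, is glossed over here):

   ```c
   // Sketch: record an event on the producer's stream after the kernels that
   // fill the buffers have been launched, then hand it off via sync_event so
   // the consumer always sees a valid cudaEvent_t*.
   static ArrowErrorCode ExportCudaArray(struct ArrowArray* array,
                                         struct ArrowDevice* device,
                                         cudaStream_t producer_stream,
                                         struct ArrowDeviceArray* device_array) {
     cudaEvent_t* event = (cudaEvent_t*)ArrowMalloc(sizeof(cudaEvent_t));
     if (event == NULL) {
       return ENOMEM;
     }

     cudaError_t result = cudaEventCreate(event);
     if (result != cudaSuccess) {
       ArrowFree(event);
       return EINVAL;
     }
     cudaEventRecord(*event, producer_stream);

     ArrowDeviceArrayInit(device_array, device);
     ArrowArrayMove(array, &device_array->array);
     device_array->sync_event = event;
     return NANOARROW_OK;
   }
   ```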





[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237003090


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.h:
##########
@@ -0,0 +1,439 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef NANOARROW_DEVICE_H_INCLUDED
+#define NANOARROW_DEVICE_H_INCLUDED
+
+#include "nanoarrow.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// \defgroup nanoarrow_device-arrow-cdata Arrow C Device interface
+///
+/// The Arrow Device and Stream interfaces are part of the
+/// Arrow Columnar Format specification
+/// (https://arrow.apache.org/docs/format/Columnar.html). See the Arrow documentation for
+/// detailed documentation of these structures.
+///
+/// @{
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure
+/// compatibility, rather than potentially diverging.
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCM_HOST 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
+struct ArrowDeviceArray {
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependent.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP).
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: sycl::event*
+  ///
+  void* sync_event;
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
+  /// the future.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
+  ArrowDeviceType device_type;
+
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independently from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  /// \brief Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+/// \brief Move the contents of src into dst and set src->array.release to NULL
+static inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                        struct ArrowDeviceArray* dst) {
+  memcpy(dst, src, sizeof(struct ArrowDeviceArray));
+  src->array.release = NULL;
+}
+
+/// @}
+
+#ifdef NANOARROW_NAMESPACE
+
+#define ArrowDeviceCheckRuntime \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCheckRuntime)
+#define ArrowDeviceArrayInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayInit)
+#define ArrowDeviceArrayViewInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewInit)
+#define ArrowDeviceArrayViewReset \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewReset)
+#define ArrowDeviceArrayViewSetArray \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewSetArray)
+#define ArrowDeviceArrayViewCopy \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopy)
+#define ArrowDeviceArrayViewCopyRequired \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayViewCopyRequired)
+#define ArrowDeviceArrayTryMove \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceArrayTryMove)
+#define ArrowDeviceResolve NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceResolve)
+#define ArrowDeviceCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceCpu)
+#define ArrowDeviceInitCpu NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceInitCpu)
+#define ArrowDeviceBufferInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferInit)
+#define ArrowDeviceBufferMove NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferMove)
+#define ArrowDeviceBufferCopy NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBufferCopy)
+#define ArrowDeviceBasicArrayStreamInit \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDeviceBasicArrayStreamInit)
+
+#endif
+
+/// \defgroup nanoarrow_device Nanoarrow Device extension
+///
+/// Except where noted, objects are not thread-safe and clients should
+/// take care to serialize accesses to methods.
+///
+/// @{
+
+/// \brief Checks the nanoarrow runtime to make sure the run/build versions match
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error);
+
+/// \brief A description of a buffer
+struct ArrowDeviceBufferView {
+  /// \brief Device-defined handle for a buffer.
+  ///
+  /// For the CPU device, this is a normal memory address; for all other types that are
+  /// currently supported, this is a device memory address on which CPU-like arithmetic
+  /// can be performed. This may not be true for future devices (i.e., it may be a pointer
+  /// to some buffer abstraction if the concept of a memory address does not exist or
+  /// is impractical).
+  const void* private_data;
+
+  /// \brief An offset into the buffer handle defined by private_data
+  int64_t offset_bytes;
+
+  /// \brief The size of the buffer in bytes
+  int64_t size_bytes;
+};
+
+/// \brief A Device wrapper with callbacks for basic memory management tasks
+///
+/// All device objects are currently implemented as singletons; however, this
+/// may change as implementations progress.
+struct ArrowDevice {
+  /// \brief The device type integer identifier (see ArrowDeviceArray)
+  ArrowDeviceType device_type;
+
+  /// \brief The device identifier (see ArrowDeviceArray)
+  int64_t device_id;
+
+  /// \brief Initialize an owning buffer from existing content
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// copying existing content.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_init)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Move an owning buffer to a device
+  ///
+  /// Creates a new buffer whose data member can be accessed by the GPU by
+  /// moving an existing buffer. If NANOARROW_OK is returned, src will have
+  /// been released or moved by the implementation and dst must be released by
+  /// the caller.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_move)(struct ArrowDevice* device_src, struct ArrowBuffer* src,
+                                struct ArrowDevice* device_dst, struct ArrowBuffer* dst);
+
+  /// \brief Copy a section of memory into a preallocated buffer
+  ///
+  /// As opposed to the other buffer operations, this is designed to support
+  /// copying very small slices of memory.
+  /// Implementations must check device_src and device_dst and return ENOTSUP if
+  /// not prepared to handle this operation.
+  ArrowErrorCode (*buffer_copy)(struct ArrowDevice* device_src,
+                                struct ArrowDeviceBufferView src,
+                                struct ArrowDevice* device_dst,
+                                struct ArrowDeviceBufferView dst);
+
+  /// \brief Check if a copy is required to move between devices
+  ///
+  /// Returns 1 (copy is required), 0 (copy not required; move is OK), or -1 (don't know)
+  int (*copy_required)(struct ArrowDevice* device_src, struct ArrowArrayView* src,
+                       struct ArrowDevice* device_dst);
+
+  /// \brief Wait for an event
+  ///
+  /// Implementations should handle at least waiting on the CPU host.
+  /// Implementations do not have to handle a NULL sync_event.
+  ArrowErrorCode (*synchronize_event)(struct ArrowDevice* device,
+                                      struct ArrowDevice* device_event, void* sync_event,
+                                      struct ArrowError* error);

Review Comment:
   Done!
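
As a rough illustration of the `ArrowDeviceBufferView`/`buffer_copy` contract above, a minimal sketch of copying one element of a (possibly non-CPU) buffer into host memory might look like the following. `CopyInt32ToHost` and `device_buffer_data` are hypothetical names; `ArrowDeviceBufferCopy()` and `ArrowDeviceCpu()` are the dispatch helpers added elsewhere in this PR.

    #include <stdint.h>

    #include "nanoarrow_device.h"

    // Copy the i-th int32 of a buffer owned by `device` into CPU memory.
    static ArrowErrorCode CopyInt32ToHost(struct ArrowDevice* device,
                                          const void* device_buffer_data, int64_t i,
                                          int32_t* out) {
      struct ArrowDeviceBufferView src;
      src.private_data = device_buffer_data;  // device-defined buffer handle
      src.offset_bytes = i * (int64_t)sizeof(int32_t);
      src.size_bytes = sizeof(int32_t);

      struct ArrowDeviceBufferView dst;
      dst.private_data = out;  // a plain CPU address
      dst.offset_bytes = 0;
      dst.size_bytes = sizeof(int32_t);

      // Dispatches to the buffer_copy callback of one of the two devices
      return ArrowDeviceBufferCopy(device, src, ArrowDeviceCpu(), dst);
    }

This mirrors the `ArrowDeviceBufferGetInt32()` helper used for offset-buffer validation later in the PR.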



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237074185


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device_cuda.c:
##########
@@ -0,0 +1,376 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cuda_runtime_api.h>
+
+#include "nanoarrow_device.h"
+
+static void ArrowDeviceCudaAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                         uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFree(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaAllocatorReallocate(struct ArrowBufferAllocator* allocator,
+                                                   uint8_t* ptr, int64_t old_size,
+                                                   int64_t new_size) {
+  ArrowDeviceCudaAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaAllocateBuffer(struct ArrowBuffer* buffer,
+                                                    int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMalloc(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+static void ArrowDeviceCudaHostAllocatorFree(struct ArrowBufferAllocator* allocator,
+                                             uint8_t* ptr, int64_t old_size) {
+  if (ptr != NULL) {
+    cudaFreeHost(ptr);
+  }
+}
+
+static uint8_t* ArrowDeviceCudaHostAllocatorReallocate(
+    struct ArrowBufferAllocator* allocator, uint8_t* ptr, int64_t old_size,
+    int64_t new_size) {
+  ArrowDeviceCudaHostAllocatorFree(allocator, ptr, old_size);
+  return NULL;
+}
+
+static ArrowErrorCode ArrowDeviceCudaHostAllocateBuffer(struct ArrowBuffer* buffer,
+                                                        int64_t size_bytes) {
+  void* ptr = NULL;
+  cudaError_t result = cudaMallocHost(&ptr, (int64_t)size_bytes);
+  if (result != cudaSuccess) {
+    return EINVAL;
+  }
+
+  buffer->data = (uint8_t*)ptr;
+  buffer->size_bytes = size_bytes;
+  buffer->capacity_bytes = size_bytes;
+  buffer->allocator.reallocate = &ArrowDeviceCudaHostAllocatorReallocate;
+  buffer->allocator.free = &ArrowDeviceCudaHostAllocatorFree;
+  // TODO: We almost certainly need device_id here
+  buffer->allocator.private_data = NULL;
+  return NANOARROW_OK;
+}
+
+// TODO: All these buffer copiers would benefit from cudaMemcpyAsync but there is
+// no good way to incorporate that just yet
+
+static ArrowErrorCode ArrowDeviceCudaBufferInit(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowBuffer* dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    struct ArrowBuffer tmp;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaAllocateBuffer(&tmp, src.size_bytes));
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    cudaError_t result =
+        cudaMemcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+                   (size_t)src.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      ArrowBufferReset(&tmp);
+      return EINVAL;
+    }
+
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceCudaHostAllocateBuffer(dst, src.size_bytes));
+    memcpy(dst->data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    struct ArrowBuffer tmp;
+    ArrowBufferInit(&tmp);
+    NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(&tmp, src.size_bytes));
+    tmp.size_bytes = src.size_bytes;
+    memcpy(tmp.data, ((uint8_t*)src.private_data) + src.offset_bytes,
+           (size_t)src.size_bytes);
+    ArrowBufferMove(&tmp, dst);
+    return NANOARROW_OK;
+
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaBufferCopy(struct ArrowDevice* device_src,
+                                                struct ArrowDeviceBufferView src,
+                                                struct ArrowDevice* device_dst,
+                                                struct ArrowDeviceBufferView dst) {
+  // This is all just cudaMemcpy or memcpy
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyHostToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToDevice);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    cudaError_t result = cudaMemcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+                                    ((uint8_t*)src.private_data) + src.offset_bytes,
+                                    dst.size_bytes, cudaMemcpyDeviceToHost);
+    if (result != cudaSuccess) {
+      return EINVAL;
+    }
+    return NANOARROW_OK;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+           ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+    return NANOARROW_OK;
+  } else {
+    return ENOTSUP;
+  }
+}
+
+static int ArrowDeviceCudaCopyRequired(struct ArrowDevice* device_src,
+                                       struct ArrowArrayView* src,
+                                       struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CUDA) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CUDA &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Copy
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CPU &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST) {
+    // Copy: we can't assume the memory has been registered. A user can force
+    // this by registering the memory and setting device->device_type manually.
+    // A copy will ensure all buffers are allocated with cudaMallocHost().
+    return 1;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_src->device_id == device_dst->device_id) {
+    // Move
+    return 0;
+
+  } else if (device_src->device_type == ARROW_DEVICE_CUDA_HOST &&
+             device_dst->device_type == ARROW_DEVICE_CPU) {
+    // Move: the array's release callback is responsible for cudaFreeHost or
+    // deregistration (or perhaps this has been handled at a higher level)
+    return 0;
+
+  } else {
+    // Fall back to the other device's implementation
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCudaSynchronize(struct ArrowDevice* device,
+                                                 struct ArrowDevice* device_event,
+                                                 void* sync_event,
+                                                 struct ArrowError* error) {

Review Comment:
   Removed!
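
To make the allocator plumbing above concrete: once a buffer has been created by these CUDA-aware allocators, the custom `free` callback means a plain `ArrowBufferReset()` releases the device memory with `cudaFree()`. A minimal caller-side sketch, assuming CUDA support was built in and device 0 exists (`CopyHostBytesToCuda` is a hypothetical helper name):

    #include <errno.h>
    #include <stdint.h>

    #include "nanoarrow_device.h"

    // Copy host bytes into a new CUDA-resident ArrowBuffer via the device's
    // buffer_init callback; a later ArrowBufferReset(out) releases it with cudaFree().
    static ArrowErrorCode CopyHostBytesToCuda(const uint8_t* data, int64_t size_bytes,
                                              struct ArrowBuffer* out) {
      struct ArrowDevice* cuda = ArrowDeviceResolve(ARROW_DEVICE_CUDA, 0);
      if (cuda == NULL) {
        return EINVAL;  // CUDA support not compiled in or device not found
      }

      struct ArrowDeviceBufferView src;
      src.private_data = data;
      src.offset_bytes = 0;
      src.size_bytes = size_bytes;

      return ArrowDeviceBufferInit(ArrowDeviceCpu(), src, cuda, out);
    }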



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] paleolimbot commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "paleolimbot (via GitHub)" <gi...@apache.org>.
paleolimbot commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237508451


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (those could be implemented
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));
+  device_array->sync_event = NULL;
+
+  // Set the device array device
+  device_array_view->device = device;
+
+  // nanoarrow's minimal validation is fine here (sets buffer sizes for non offset-buffer
+  // types and errors for invalid ones)
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayMinimal(&device_array_view->array_view,
+                                                        &device_array->array, error));
+  // Run custom validator that copies memory to the CPU where required.
+  // The custom implementation doesn't set nice error messages yet.
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowDeviceArrayViewValidateDefault(device, &device_array_view->array_view), error);
+
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewCopyInternal(struct ArrowDevice* device_src,
+                                                       struct ArrowArrayView* src,
+                                                       struct ArrowDevice* device_dst,
+                                                       struct ArrowArray* dst) {
+  // Currently no attempt is made to minimize the amount of memory copied (i.e.,
+  // by applying offset + length and copying potentially fewer bytes)
+  dst->length = src->length;
+  dst->offset = src->offset;
+  dst->null_count = src->null_count;
+
+  struct ArrowDeviceBufferView buffer_view_src;
+  buffer_view_src.offset_bytes = 0;
+
+  for (int i = 0; i < 3; i++) {
+    if (src->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
+      break;
+    }
+
+    buffer_view_src.private_data = src->buffer_views[i].data.data;
+    buffer_view_src.size_bytes = src->buffer_views[i].size_bytes;
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferInit(device_src, buffer_view_src, device_dst,
+                                                  ArrowArrayBuffer(dst, i)));
+  }
+
+  for (int64_t i = 0; i < src->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewCopyInternal(
+        device_src, src->children[i], device_dst, dst->children[i]));
+  }
+
+  if (src->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowDeviceArrayViewCopyInternal(
+        device_src, src->dictionary, device_dst, dst->dictionary));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewCopy(struct ArrowDeviceArrayView* src,
+                                        struct ArrowDevice* device_dst,
+                                        struct ArrowDeviceArray* dst) {
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(&tmp, &src->array_view, NULL));
+
+  int result =
+      ArrowDeviceArrayViewCopyInternal(src->device, &src->array_view, device_dst, &tmp);
+  if (result != NANOARROW_OK) {
+    tmp.release(&tmp);
+    return result;
+  }
+
+  result = ArrowArrayFinishBuilding(&tmp, NANOARROW_VALIDATION_LEVEL_MINIMAL, NULL);
+  if (result != NANOARROW_OK) {
+    tmp.release(&tmp);
+    return result;
+  }
+
+  ArrowDeviceArrayInit(dst, device_dst);
+  ArrowArrayMove(&tmp, &dst->array);
+  dst->device_type = device_dst->device_type;
+  dst->device_id = device_dst->device_id;
+  return result;
+}
+
+int ArrowDeviceArrayViewCopyRequired(struct ArrowDeviceArrayView* src,
+                                     struct ArrowDevice* device_dst) {

Review Comment:
   I just removed it. It's a good point about making all copies explicit. Now there are `ArrowDeviceArrayViewCopy()` and `ArrowDeviceArrayMoveToDevice()`, and the user picks between them and handles errors appropriately.
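
For context, an explicit copy using the functions shown in this diff might look like the sketch below. `DeviceArrayToCpu` is a hypothetical helper name, and the `ArrowArrayViewInitFromSchema()` call follows the nanoarrow core convention of attaching type information to the embedded `ArrowArrayView` before `ArrowDeviceArrayViewSetArray()` is used.

    #include "nanoarrow_device.h"

    // Validate a device array against a schema and copy it to CPU memory.
    static ArrowErrorCode DeviceArrayToCpu(struct ArrowSchema* schema,
                                           struct ArrowDeviceArray* device_array,
                                           struct ArrowDeviceArray* cpu_array,
                                           struct ArrowError* error) {
      struct ArrowDeviceArrayView view;
      ArrowDeviceArrayViewInit(&view);

      int result = ArrowArrayViewInitFromSchema(&view.array_view, schema, error);
      if (result != NANOARROW_OK) {
        ArrowDeviceArrayViewReset(&view);
        return result;
      }

      // Resolves the device, waits on sync_event, and validates buffer sizes
      // (copying small slices of offset buffers to the CPU where required)
      result = ArrowDeviceArrayViewSetArray(&view, device_array, error);
      if (result != NANOARROW_OK) {
        ArrowDeviceArrayViewReset(&view);
        return result;
      }

      // Explicitly copy every buffer to the CPU device
      result = ArrowDeviceArrayViewCopy(&view, ArrowDeviceCpu(), cpu_array);
      ArrowDeviceArrayViewReset(&view);
      return result;
    }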


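Similarly, the basic stream wrapper earlier in this file can expose an existing CPU `ArrowArrayStream` through the device interface. A minimal sketch (`FirstDeviceArrayFromStream` is a hypothetical name; the wrapped stream is assumed to produce CPU-resident arrays, since the wrapper only tags each array with the supplied device):

    #include "nanoarrow_device.h"

    // Wrap a CPU ArrowArrayStream and pull its first batch as an ArrowDeviceArray.
    static ArrowErrorCode FirstDeviceArrayFromStream(struct ArrowArrayStream* cpu_stream,
                                                     struct ArrowDeviceArray* out) {
      struct ArrowDeviceArrayStream device_stream;
      // Moves cpu_stream into the wrapper; it is released along with device_stream.
      NANOARROW_RETURN_NOT_OK(
          ArrowDeviceBasicArrayStreamInit(&device_stream, cpu_stream, ArrowDeviceCpu()));

      int result = device_stream.get_next(&device_stream, out);
      device_stream.release(&device_stream);
      return result;
    }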

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-nanoarrow] kkraus14 commented on a diff in pull request #205: feat(extensions/nanoarrow_device): Draft DeviceArray interface

Posted by "kkraus14 (via GitHub)" <gi...@apache.org>.
kkraus14 commented on code in PR #205:
URL: https://github.com/apache/arrow-nanoarrow/pull/205#discussion_r1237476219


##########
extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c:
##########
@@ -0,0 +1,518 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <errno.h>
+
+#include "nanoarrow.h"
+
+#include "nanoarrow_device.h"
+
+ArrowErrorCode ArrowDeviceCheckRuntime(struct ArrowError* error) {
+  const char* nanoarrow_runtime_version = ArrowNanoarrowVersion();
+  const char* nanoarrow_ipc_build_time_version = NANOARROW_VERSION;
+
+  if (strcmp(nanoarrow_runtime_version, nanoarrow_ipc_build_time_version) != 0) {
+    ArrowErrorSet(error, "Expected nanoarrow runtime version '%s' but found version '%s'",
+                  nanoarrow_ipc_build_time_version, nanoarrow_runtime_version);
+    return EINVAL;
+  }
+
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayInit(struct ArrowDeviceArray* device_array,
+                          struct ArrowDevice* device) {
+  memset(device_array, 0, sizeof(struct ArrowDeviceArray));
+  device_array->device_type = device->device_type;
+  device_array->device_id = device->device_id;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferInit(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferInit(dst);
+  dst->allocator = ArrowBufferAllocatorDefault();
+  NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(
+      dst, ((uint8_t*)src.private_data) + src.offset_bytes, src.size_bytes));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferMove(struct ArrowDevice* device_src,
+                                               struct ArrowBuffer* src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowBuffer* dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  ArrowBufferMove(src, dst);
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceCpuBufferCopy(struct ArrowDevice* device_src,
+                                               struct ArrowDeviceBufferView src,
+                                               struct ArrowDevice* device_dst,
+                                               struct ArrowDeviceBufferView dst) {
+  if (device_dst->device_type != ARROW_DEVICE_CPU ||
+      device_src->device_type != ARROW_DEVICE_CPU) {
+    return ENOTSUP;
+  }
+
+  memcpy(((uint8_t*)dst.private_data) + dst.offset_bytes,
+         ((uint8_t*)src.private_data) + src.offset_bytes, dst.size_bytes);
+  return NANOARROW_OK;
+}
+
+static int ArrowDeviceCpuCopyRequired(struct ArrowDevice* device_src,
+                                      struct ArrowArrayView* src,
+                                      struct ArrowDevice* device_dst) {
+  if (device_src->device_type == ARROW_DEVICE_CPU &&
+      device_dst->device_type == ARROW_DEVICE_CPU) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+static ArrowErrorCode ArrowDeviceCpuSynchronize(struct ArrowDevice* device,
+                                                struct ArrowDevice* device_event,
+                                                void* sync_event,
+                                                struct ArrowError* error) {
+  switch (device_event->device_type) {
+    case ARROW_DEVICE_CPU:
+      if (sync_event != NULL) {
+        ArrowErrorSet(error, "Expected NULL sync_event for ARROW_DEVICE_CPU but got %p",
+                      sync_event);
+        return EINVAL;
+      } else {
+        return NANOARROW_OK;
+      }
+    default:
+      return device_event->synchronize_event(device_event, device, sync_event, error);
+  }
+}
+
+static void ArrowDeviceCpuRelease(struct ArrowDevice* device) { device->release = NULL; }
+
+struct ArrowDevice* ArrowDeviceCpu(void) {
+  static struct ArrowDevice* cpu_device_singleton = NULL;
+  if (cpu_device_singleton == NULL) {
+    cpu_device_singleton = (struct ArrowDevice*)ArrowMalloc(sizeof(struct ArrowDevice));
+    ArrowDeviceInitCpu(cpu_device_singleton);
+  }
+
+  return cpu_device_singleton;
+}
+
+void ArrowDeviceInitCpu(struct ArrowDevice* device) {
+  device->device_type = ARROW_DEVICE_CPU;
+  device->device_id = 0;
+  device->buffer_init = &ArrowDeviceCpuBufferInit;
+  device->buffer_move = &ArrowDeviceCpuBufferMove;
+  device->buffer_copy = &ArrowDeviceCpuBufferCopy;
+  device->copy_required = &ArrowDeviceCpuCopyRequired;
+  device->synchronize_event = &ArrowDeviceCpuSynchronize;
+  device->release = &ArrowDeviceCpuRelease;
+  device->private_data = NULL;
+}
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+struct ArrowDevice* ArrowDeviceMetalDefaultDevice(void);
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+struct ArrowDevice* ArrowDeviceCuda(ArrowDeviceType device_type, int64_t device_id);
+#endif
+
+struct ArrowDevice* ArrowDeviceResolve(ArrowDeviceType device_type, int64_t device_id) {
+  if (device_type == ARROW_DEVICE_CPU && device_id == 0) {
+    return ArrowDeviceCpu();
+  }
+
+#ifdef NANOARROW_DEVICE_WITH_METAL
+  if (device_type == ARROW_DEVICE_METAL) {
+    struct ArrowDevice* default_device = ArrowDeviceMetalDefaultDevice();
+    if (device_id == default_device->device_id) {
+      return default_device;
+    }
+  }
+#endif
+
+#ifdef NANOARROW_DEVICE_WITH_CUDA
+  if (device_type == ARROW_DEVICE_CUDA || device_type == ARROW_DEVICE_CUDA_HOST) {
+    return ArrowDeviceCuda(device_type, device_id);
+  }
+#endif
+
+  return NULL;
+}
+
+ArrowErrorCode ArrowDeviceBufferInit(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_init(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_init(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferMove(struct ArrowDevice* device_src,
+                                     struct ArrowBuffer* src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowBuffer* dst) {
+  int result = device_dst->buffer_move(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_move(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+ArrowErrorCode ArrowDeviceBufferCopy(struct ArrowDevice* device_src,
+                                     struct ArrowDeviceBufferView src,
+                                     struct ArrowDevice* device_dst,
+                                     struct ArrowDeviceBufferView dst) {
+  int result = device_dst->buffer_copy(device_src, src, device_dst, dst);
+  if (result == ENOTSUP) {
+    result = device_src->buffer_copy(device_src, src, device_dst, dst);
+  }
+
+  return result;
+}
+
+struct ArrowBasicDeviceArrayStreamPrivate {
+  struct ArrowDevice* device;
+  struct ArrowArrayStream naive_stream;
+};
+
+static int ArrowDeviceBasicArrayStreamGetSchema(
+    struct ArrowDeviceArrayStream* array_stream, struct ArrowSchema* schema) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_schema(&private_data->naive_stream, schema);
+}
+
+static int ArrowDeviceBasicArrayStreamGetNext(struct ArrowDeviceArrayStream* array_stream,
+                                              struct ArrowDeviceArray* device_array) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+
+  struct ArrowArray tmp;
+  NANOARROW_RETURN_NOT_OK(
+      private_data->naive_stream.get_next(&private_data->naive_stream, &tmp));
+  ArrowDeviceArrayInit(device_array, private_data->device);
+  ArrowArrayMove(&tmp, &device_array->array);
+  return NANOARROW_OK;
+}
+
+static const char* ArrowDeviceBasicArrayStreamGetLastError(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  return private_data->naive_stream.get_last_error(&private_data->naive_stream);
+}
+
+static void ArrowDeviceBasicArrayStreamRelease(
+    struct ArrowDeviceArrayStream* array_stream) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)array_stream->private_data;
+  private_data->naive_stream.release(&private_data->naive_stream);
+  ArrowFree(private_data);
+  array_stream->release = NULL;
+}
+
+ArrowErrorCode ArrowDeviceBasicArrayStreamInit(
+    struct ArrowDeviceArrayStream* device_array_stream,
+    struct ArrowArrayStream* array_stream, struct ArrowDevice* device) {
+  struct ArrowBasicDeviceArrayStreamPrivate* private_data =
+      (struct ArrowBasicDeviceArrayStreamPrivate*)ArrowMalloc(
+          sizeof(struct ArrowBasicDeviceArrayStreamPrivate));
+  if (private_data == NULL) {
+    return ENOMEM;
+  }
+
+  private_data->device = device;
+  ArrowArrayStreamMove(array_stream, &private_data->naive_stream);
+
+  device_array_stream->device_type = device->device_type;
+  device_array_stream->get_schema = &ArrowDeviceBasicArrayStreamGetSchema;
+  device_array_stream->get_next = &ArrowDeviceBasicArrayStreamGetNext;
+  device_array_stream->get_last_error = &ArrowDeviceBasicArrayStreamGetLastError;
+  device_array_stream->release = &ArrowDeviceBasicArrayStreamRelease;
+  device_array_stream->private_data = private_data;
+  return NANOARROW_OK;
+}
+
+void ArrowDeviceArrayViewInit(struct ArrowDeviceArrayView* device_array_view) {
+  memset(device_array_view, 0, sizeof(struct ArrowDeviceArrayView));
+}
+
+void ArrowDeviceArrayViewReset(struct ArrowDeviceArrayView* device_array_view) {
+  ArrowArrayViewReset(&device_array_view->array_view);
+  device_array_view->device = NULL;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt32(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int32_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int32_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int32_t);
+  device_buffer_view.size_bytes = sizeof(int32_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceBufferGetInt64(struct ArrowDevice* device,
+                                                struct ArrowBufferView buffer_view,
+                                                int64_t i, int64_t* out) {
+  struct ArrowDeviceBufferView device_buffer_view;
+  void* sync_event = NULL;
+
+  struct ArrowDeviceBufferView out_view;
+  out_view.private_data = out;
+  out_view.offset_bytes = 0;
+  out_view.size_bytes = sizeof(int64_t);
+
+  device_buffer_view.private_data = buffer_view.data.data;
+  device_buffer_view.offset_bytes = i * sizeof(int64_t);
+  device_buffer_view.size_bytes = sizeof(int64_t);
+  NANOARROW_RETURN_NOT_OK(
+      ArrowDeviceBufferCopy(device, device_buffer_view, ArrowDeviceCpu(), out_view));
+  return NANOARROW_OK;
+}
+
+static ArrowErrorCode ArrowDeviceArrayViewValidateDefault(
+    struct ArrowDevice* device, struct ArrowArrayView* array_view) {
+  // Calculate buffer sizes or child lengths that require accessing the offsets
+  // buffer. Unlike the nanoarrow core default validation, this just checks the
+  // last buffer and doesn't set a nice error message (those could be implemented
+  // later on).
+  int64_t offset_plus_length = array_view->offset + array_view->length;
+  int32_t last_offset32;
+  int64_t last_offset64;
+
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset32;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_STRING:
+    case NANOARROW_TYPE_LARGE_BINARY:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+
+        // If the data buffer size is unknown, assign it; otherwise, check it
+        if (array_view->buffer_views[2].size_bytes == -1) {
+          array_view->buffer_views[2].size_bytes = last_offset64;
+        } else if (array_view->buffer_views[2].size_bytes < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LIST:
+    case NANOARROW_TYPE_MAP:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt32(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset32));
+        if (array_view->children[0]->length < last_offset32) {
+          return EINVAL;
+        }
+      }
+      break;
+
+    case NANOARROW_TYPE_LARGE_LIST:
+      if (array_view->buffer_views[1].size_bytes != 0) {
+        NANOARROW_RETURN_NOT_OK(ArrowDeviceBufferGetInt64(
+            device, array_view->buffer_views[1], offset_plus_length, &last_offset64));
+        if (array_view->children[0]->length < last_offset64) {
+          return EINVAL;
+        }
+      }
+      break;
+    default:
+      break;
+  }
+
+  // Recurse for children
+  for (int64_t i = 0; i < array_view->n_children; i++) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowDeviceArrayViewValidateDefault(device, array_view->children[i]));
+  }
+
+  return NANOARROW_OK;
+}
+
+ArrowErrorCode ArrowDeviceArrayViewSetArray(
+    struct ArrowDeviceArrayView* device_array_view, struct ArrowDeviceArray* device_array,
+    struct ArrowError* error) {
+  struct ArrowDevice* device =
+      ArrowDeviceResolve(device_array->device_type, device_array->device_id);
+  if (device == NULL) {
+    ArrowErrorSet(error, "Can't resolve device with type %d and identifier %ld",
+                  (int)device_array->device_type, (long)device_array->device_id);
+    return EINVAL;
+  }
+
+  // Wait on device_array to synchronize with the CPU
+  NANOARROW_RETURN_NOT_OK(device->synchronize_event(ArrowDeviceCpu(), device,
+                                                    device_array->sync_event, error));

Review Comment:
   All variants of `cudaMemcpy` are stream ordered, so you can safely use them without synchronizing the stream first. However, the destination data is stream ordered as well, so if you're going to operate on it from a different stream or from host code, you need to synchronize the stream in some way.
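
A short sketch of that pattern, assuming a caller-managed `cudaStream_t` (illustrative only, not part of this PR): enqueue the copy with `cudaMemcpyAsync()` and synchronize the stream before host code touches the destination.

    #include <cuda_runtime_api.h>

    // Stream-ordered device-to-host copy: the copy is enqueued on `stream`, so the
    // host must synchronize before it can safely read `host_dst`.
    static cudaError_t CopyToHostAndWait(void* host_dst, const void* device_src,
                                         size_t size_bytes, cudaStream_t stream) {
      cudaError_t result = cudaMemcpyAsync(host_dst, device_src, size_bytes,
                                           cudaMemcpyDeviceToHost, stream);
      if (result != cudaSuccess) {
        return result;
      }

      // Alternatively, record a cudaEvent_t here and hand it out as the
      // ArrowDeviceArray sync_event instead of blocking right away.
      return cudaStreamSynchronize(stream);
    }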



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org