You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by er...@apache.org on 2023/12/05 17:33:30 UTC
(arrow-julia) branch main updated: enable field-order-agnostic overloads of `fromarrow` for struct types (#493)
This is an automated email from the ASF dual-hosted git repository.
ericphanson pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-julia.git
The following commit(s) were added to refs/heads/main by this push:
new 3712291 enable field-order-agnostic overloads of `fromarrow` for struct types (#493)
3712291 is described below
commit 37122911c24f44318e6d4a0840408adb3364cf2a
Author: Jarrett Revels <ja...@gmail.com>
AuthorDate: Tue Dec 5 12:33:24 2023 -0500
enable field-order-agnostic overloads of `fromarrow` for struct types (#493)
Motivated by
https://github.com/beacon-biosignals/Legolas.jl/issues/94#issuecomment-1837366852
Still requires:
- [x] docs
- [x] a test
- [x] a bit more due diligence benchmarking-wise. `@benchmark`ing the
access in the test case from
https://github.com/beacon-biosignals/Legolas.jl/issues/94 didn't reveal
any perf difference, which seems like a good sign
---------
Co-authored-by: Eric Hanson <58...@users.noreply.github.com>
---
Project.toml | 2 +-
src/ArrowTypes/Project.toml | 2 +-
src/ArrowTypes/src/ArrowTypes.jl | 8 +++++++-
src/arraytypes/struct.jl | 37 ++++++++++++++++++++++---------------
src/table.jl | 3 ++-
test/runtests.jl | 29 ++++++++++++++++++++++++++++-
6 files changed, 61 insertions(+), 20 deletions(-)
diff --git a/Project.toml b/Project.toml
index 80e6073..1e3ffe3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,7 +17,7 @@
name = "Arrow"
uuid = "69666777-d1a9-59fb-9406-91d4454c9d45"
authors = ["quinnj <qu...@gmail.com>"]
-version = "2.6.3"
+version = "2.7.0"
[deps]
ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
diff --git a/src/ArrowTypes/Project.toml b/src/ArrowTypes/Project.toml
index 95f411c..0166f60 100644
--- a/src/ArrowTypes/Project.toml
+++ b/src/ArrowTypes/Project.toml
@@ -18,7 +18,7 @@
name = "ArrowTypes"
uuid = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
authors = ["quinnj <qu...@gmail.com>"]
-version = "2.2.2"
+version = "2.3.0"
[deps]
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
diff --git a/src/ArrowTypes/src/ArrowTypes.jl b/src/ArrowTypes/src/ArrowTypes.jl
index bd67c5f..86183b5 100644
--- a/src/ArrowTypes/src/ArrowTypes.jl
+++ b/src/ArrowTypes/src/ArrowTypes.jl
@@ -170,11 +170,15 @@ overload is necessary.
A few `ArrowKind`s have/allow slightly more custom overloads for their `fromarrow` methods:
* `ListKind{true}`: for `String` types, they may overload `fromarrow(::Type{T}, ptr::Ptr{UInt8}, len::Int) = ...` to avoid
materializing a `String`
- * `StructKind`: must overload `fromarrow(::Type{T}, x...)` where individual fields are passed as separate
+ * `StructKind`:
+ * May overload `fromarrow(::Type{T}, x...)` where individual fields are passed as separate
positional arguments; so if my custom type `Interval` has two fields `first` and `last`, then I'd overload like
`ArrowTypes.fromarrow(::Type{Interval}, first, last) = ...`. Note the default implementation is
`ArrowTypes.fromarrow(::Type{T}, x...) = T(x...)`, so if your type already accepts all arguments in a constructor
no additional `fromarrow` method should be necessary (default struct constructors have this behavior).
+ * Alternatively, may overload `fromarrowstruct(::Type{T}, ::Val{fnames}, x...)`, where `fnames` is a tuple of the
+ field names corresponding to the values in `x`. This approach is useful when you need to implement deserialization
+ in a manner that is agnostic to the field order used by the serializer. When implemented, `fromarrowstruct` takes precedence over `fromarrow` in `StructKind` deserialization.
"""
function fromarrow end
fromarrow(::Type{T}, x::T) where {T} = x
@@ -302,6 +306,8 @@ struct StructKind <: ArrowKind end
ArrowKind(::Type{<:NamedTuple}) = StructKind()
+@inline fromarrowstruct(T::Type, ::Val, x...) = fromarrow(T, x...)
+
fromarrow(
::Type{NamedTuple{names,types}},
x::NamedTuple{names,types},
diff --git a/src/arraytypes/struct.jl b/src/arraytypes/struct.jl
index 4ad9752..510d1e4 100644
--- a/src/arraytypes/struct.jl
+++ b/src/arraytypes/struct.jl
@@ -19,7 +19,7 @@
An `ArrowVector` where each element is a "struct" of some kind with ordered, named fields, like a `NamedTuple{names, types}` or regular julia `struct`.
"""
-struct Struct{T,S} <: ArrowVector{T}
+struct Struct{T,S,fnames} <: ArrowVector{T}
validity::ValidityBitmap
data::S # Tuple of ArrowVector
ℓ::Int
@@ -33,23 +33,29 @@ isnamedtuple(T) = false
istuple(::Type{<:Tuple}) = true
istuple(T) = false
-@propagate_inbounds function Base.getindex(s::Struct{T,S}, i::Integer) where {T,S}
+if isdefined(ArrowTypes, :fromarrowstruct)
+ # https://github.com/apache/arrow-julia/pull/493
+ @inline function _fromarrowstruct(T::Type, v::Val, x...)
+ return ArrowTypes.fromarrowstruct(T, v, x...)
+ end
+else
+ @inline function _fromarrowstruct(T::Type, ::Val, x...)
+ return ArrowTypes.fromarrow(T, x...)
+ end
+end
+
+@propagate_inbounds function Base.getindex(
+ s::Struct{T,S,fnames},
+ i::Integer,
+) where {T,S,fnames}
@boundscheck checkbounds(s, i)
NT = Base.nonmissingtype(T)
+ NT !== T && (s.validity[i] || return missing)
+ vals = ntuple(j -> s.data[j][i], fieldcount(S))
if isnamedtuple(NT) || istuple(NT)
- if NT !== T
- return s.validity[i] ? NT(ntuple(j -> s.data[j][i], fieldcount(S))) : missing
- else
- return NT(ntuple(j -> s.data[j][i], fieldcount(S)))
- end
+ return NT(vals)
else
- if NT !== T
- return s.validity[i] ?
- ArrowTypes.fromarrow(NT, (s.data[j][i] for j = 1:fieldcount(S))...) :
- missing
- else
- return ArrowTypes.fromarrow(NT, (s.data[j][i] for j = 1:fieldcount(S))...)
- end
+ return _fromarrowstruct(NT, Val{fnames}(), vals...)
end
end
@@ -100,7 +106,8 @@ function arrowvector(::StructKind, x, i, nl, fi, de, ded, meta; kw...)
arrowvector(ToStruct(x, j), i, nl + 1, j, de, ded, nothing; kw...) for
j = 1:fieldcount(T)
)
- return Struct{withmissing(eltype(x), namedtupletype(T, data)),typeof(data)}(
+ NT = namedtupletype(T, data)
+ return Struct{withmissing(eltype(x), NT),typeof(data),fieldnames(NT)}(
validity,
data,
len,
diff --git a/src/table.jl b/src/table.jl
index 882a99b..ecd8b1d 100644
--- a/src/table.jl
+++ b/src/table.jl
@@ -840,7 +840,8 @@ function build(f::Meta.Field, L::Meta.Struct, batch, rb, de, nodeidx, bufferidx,
data = Tuple(vecs)
meta = buildmetadata(f.custom_metadata)
T = juliaeltype(f, meta, convert)
- return Struct{T,typeof(data)}(validity, data, len, meta), nodeidx, bufferidx
+ fnames = ntuple(i -> Symbol(f.children[i].name), length(f.children))
+ return Struct{T,typeof(data),fnames}(validity, data, len, meta), nodeidx, bufferidx
end
function build(f::Meta.Field, L::Meta.Union, batch, rb, de, nodeidx, bufferidx, convert)
diff --git a/test/runtests.jl b/test/runtests.jl
index 48ca399..ed288b3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1014,6 +1014,33 @@ end
# @test isequal(table.v, table2.v)
# end
-
+ if isdefined(ArrowTypes, :StructElement)
+ @testset "# 493" begin
+ # This test stresses the existence of the mechanism
+ # implemented in https://github.com/apache/arrow-julia/pull/493,
+ # but doesn't stress the actual use case that motivates
+ # that mechanism, simply because it'd be more annoying to
+ # write that test; see the PR for details.
+ struct Foo493
+ x::Int
+ y::Int
+ end
+ ArrowTypes.arrowname(::Type{Foo493}) = Symbol("JuliaLang.Foo493")
+ ArrowTypes.JuliaType(::Val{Symbol("JuliaLang.Foo493")}, T) = Foo493
+ function ArrowTypes.fromarrowstruct(
+ ::Type{Foo493},
+ ::Val{fnames},
+ x...,
+ ) where {fnames}
+ nt = NamedTuple{fnames}(x)
+ return Foo493(nt.x + 1, nt.y + 1)
+ end
+ t = (; f=[Foo493(1, 2), Foo493(3, 4)])
+ buf = Arrow.tobuffer(t)
+ tbl = Arrow.Table(buf)
+ @test tbl.f[1] === Foo493(2, 3)
+ @test tbl.f[2] === Foo493(4, 5)
+ end
+ end
end # @testset "misc"
end