You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by qu...@apache.org on 2023/06/02 17:11:45 UTC

[arrow-julia] branch main updated: Return SubArrays when possible for arrow list types (#446)

This is an automated email from the ASF dual-hosted git repository.

quinnj pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-julia.git


The following commit(s) were added to refs/heads/main by this push:
     new e295928  Return SubArrays when possible for arrow list types (#446)
e295928 is described below

commit e295928b879b65180f89fb47fb3bbcfcb4053045
Author: Jacob Quinn <qu...@gmail.com>
AuthorDate: Fri Jun 2 11:11:34 2023 -0600

    Return SubArrays when possible for arrow list types (#446)
---
 src/append.jl     |  8 +++++++-
 src/table.jl      | 12 ++++++++++--
 test/arrowjson.jl |  2 +-
 test/runtests.jl  |  2 +-
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/append.jl b/src/append.jl
index fdcd5e5..db7f1d3 100644
--- a/src/append.jl
+++ b/src/append.jl
@@ -197,7 +197,13 @@ end
 function is_equivalent_schema(sch1::Tables.Schema, sch2::Tables.Schema)
     (sch1.names == sch2.names) || (return false)
     for (t1,t2) in zip(sch1.types, sch2.types)
-        (t1 === t2) || (return false)
+        tt1 = Base.nonmissingtype(t1)
+        tt2 = Base.nonmissingtype(t2)
+        if t1 == t2 || (tt1 <: AbstractVector && tt2 <: AbstractVector && eltype(tt1) == eltype(tt2))
+            continue
+        else
+            return false
+        end
     end
     true
 end
diff --git a/src/table.jl b/src/table.jl
index da23038..50bcd3b 100644
--- a/src/table.jl
+++ b/src/table.jl
@@ -625,6 +625,8 @@ function reinterp(::Type{T}, batch, buf, compression) where {T}
     end
 end
 
+const SubVector{T, P} = SubArray{T, 1, P, Tuple{UnitRange{Int64}}, true}
+
 function build(f::Meta.Field, L::ListTypes, batch, rb, de, nodeidx, bufferidx, convert)
     @debugv 2 "building array: L = $L"
     validity = buildbitmap(batch, rb, nodeidx, bufferidx)
@@ -637,6 +639,8 @@ function build(f::Meta.Field, L::ListTypes, batch, rb, de, nodeidx, bufferidx, c
     bufferidx += 1
     len = rb.nodes[nodeidx].length
     nodeidx += 1
+    meta = buildmetadata(f.custom_metadata)
+    T = juliaeltype(f, meta, convert)
     if L isa Meta.Utf8 || L isa Meta.LargeUtf8 || L isa Meta.Binary || L isa Meta.LargeBinary
         buffer = rb.buffers[bufferidx]
         bytes, A = reinterp(UInt8, batch, buffer, rb.compression)
@@ -644,9 +648,13 @@ function build(f::Meta.Field, L::ListTypes, batch, rb, de, nodeidx, bufferidx, c
     else
         bytes = UInt8[]
         A, nodeidx, bufferidx = build(f.children[1], batch, rb, de, nodeidx, bufferidx, convert)
+        # juliaeltype returns Vector for List, translate to SubArray
+        S = Base.nonmissingtype(T)
+        if S <: Vector
+            ST = SubVector{eltype(S), typeof(A)}
+            T = S == T ? ST : Union{Missing, ST}
+        end
     end
-    meta = buildmetadata(f.custom_metadata)
-    T = juliaeltype(f, meta, convert)
     return List{T, OT, typeof(A)}(bytes, validity, offsets, A, len, meta), nodeidx, bufferidx
 end
 
diff --git a/test/arrowjson.jl b/test/arrowjson.jl
index 586ab93..7cd7f45 100644
--- a/test/arrowjson.jl
+++ b/test/arrowjson.jl
@@ -596,7 +596,7 @@ function DataFile(source)
 end
 
 function Base.isequal(df::DataFile, tbl::Arrow.Table)
-    Tables.schema(df) == Tables.schema(tbl) || return false
+    Arrow.is_equivalent_schema(Tables.schema(df), Tables.schema(tbl)) || return false
     i = 1
     for (col1, col2) in zip(Tables.Columns(df), Tables.Columns(tbl))
         if !isequal(col1, col2)
diff --git a/test/runtests.jl b/test/runtests.jl
index dfac2c2..12c826c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -415,7 +415,7 @@ t = (
     col1=[["boop", "she"], ["boop", "she"], ["boo"]],
 )
 tbl = Arrow.Table(Arrow.tobuffer(t))
-@test eltype(tbl.col1) == Vector{String}
+@test eltype(tbl.col1) <: AbstractVector{String}
 end
 
 @testset "# 200 VersionNumber" begin